How to get the root of a tree without parsing the whole file?

I am creating an xml parser to parse XML reports from different tools, and each tool generates different reports with different tags.

For instance:

Arachni generates an xml report with <arachni_report></arachni_report>as a tree root tag.

nmap generates an xml report with <nmaprun></nmaprun>as a tree root tag.

I try not to analyze the whole file if it is not a valid report from any of the tools that I want.

The first thing I thought of using it was ElementTree, parse the whole XML file (suppose it contains valid xml), and then check based on the root of the tree if the report belongs to Arachni or nmap.

I am currently using cElementTree, and as far as I know, getroot () is not an option here, but my goal is to get this parser to work only with recognized files, without parsing unnecessary files.

By the way, I'm still good at parsing xml, thanks in advance.

+3
source share
6 answers

"simple string methods" is the root [intended for puns] of all evil - see examples below.

Update 2 The code and output now shows that the proposed regular expressions also do not work very well.

Use ElementTree. The function you are looking for is iterparse. Enable "trigger" events. Deflate at the first iteration.

the code:

# coding: ascii
import xml.etree.cElementTree as et
# import xml.etree.ElementTree as et
# import lxml.etree as et
from cStringIO import StringIO
import re

xml_text_1 = """\
<?xml version="1.0" ?> 
<!--  this is a comment --> 
<root
><foo>bar</foo></root
>
"""

xml_text_2 = """\
<?xml version="1.0" ?> 
<!--  this is a comment --> 
<root
><foo>bar</foo></root
>
<!--
That all, folks! 
-->
"""

xml_text_3 = '''<?xml version="1.0" ?>
<!-- <mole1> -->
<root><foo /></root>
<!-- </mole2> -->'''

xml_text_4 = '''<?xml version="1.0" ?><!-- <mole1> --><root><foo /></root><!-- </mole2> -->'''

for xml_text in (xml_text_1, xml_text_2, xml_text_3, xml_text_4):
    print
    chrstr = xml_text.strip()
    x = max(chrstr.rfind('\r'),chrstr.rfind('\n'))
    lastline = chrstr[x:]
    print "*** eyquem 1:", repr(lastline.strip())

    chrstr = xml_text.strip()
    x = max(chrstr.rfind('\r'),chrstr.rfind('\n'))
    lastline = chrstr[x+1:]
    if lastline[0:5]=='<!-- ':
        chrstr = xml_text[0:x].rstrip()
        x = max(chrstr.rfind('\r'),chrstr.rfind('\n'))
        print "*** eyquem 2:", repr(chrstr[x+1:])
    else:
        print "*** eyquem 2:", repr(lastline)

    m = None
    for m in re.finditer('^</[^>]+>', xml_text, re.MULTILINE):
        pass
    if m: print "*** eyquem 3:", repr(m.group())
    else: print "*** eyquem 3:", "FAIL"

    m = None
    for m in re.finditer('</[^>]+>', xml_text):
        pass
    if m: print "*** eyquem 4:", repr(m.group())
    else: print "*** eyquem 4:", "FAIL"

    m = re.search('^<(?![?!])[^>]+>', xml_text, re.MULTILINE)
    if m: print "*** eyquem 5:", repr(m.group())
    else: print "*** eyquem 5:", "FAIL"

    m = re.search('<(?![?!])[^>]+>', xml_text)
    if m: print "*** eyquem 6:", repr(m.group())
    else: print "*** eyquem 6:", "FAIL"

    filelike_obj = StringIO(xml_text)
    tree = et.parse(filelike_obj)
    print "*** parse:", tree.getroot().tag

    filelike_obj = StringIO(xml_text)
    for event, elem in et.iterparse(filelike_obj, ('start', 'end')):
        print "*** iterparse:", elem.tag
        break

, ElementTree, Python 2.5 2.7. Python 2.2 2.4; ElementTree cElementTree effbot.org . lxml.

:

*** eyquem 1: '>'
*** eyquem 2: '>'
*** eyquem 3: FAIL
*** eyquem 4: '</root\n>'
*** eyquem 5: '<root\n>'
*** eyquem 6: '<root\n>'
*** parse: root
*** iterparse: root

*** eyquem 1: '-->'
*** eyquem 2: '-->'
*** eyquem 3: FAIL
*** eyquem 4: '</root\n>'
*** eyquem 5: '<root\n>'
*** eyquem 6: '<root\n>'
*** parse: root
*** iterparse: root

*** eyquem 1: '<!-- </mole2> -->'
*** eyquem 2: '<root><foo /></root>'
*** eyquem 3: FAIL
*** eyquem 4: '</mole2>'
*** eyquem 5: '<root>'
*** eyquem 6: '<mole1>'
*** parse: root
*** iterparse: root

*** eyquem 1: '>'
*** eyquem 2: '<?xml version="1.0" ?><!-- <mole1> --><root><foo /></root><!-- </mole2> -->'
*** eyquem 3: FAIL
*** eyquem 4: '</mole2>'
*** eyquem 5: FAIL
*** eyquem 6: '<mole1>'
*** parse: root
*** iterparse: root

1 . ... . Python 2.7 2.2.

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET

def get_root_tag_from_xml_file(xml_file_path):
    result = f = None
    try:
        f = open(xml_file_path, 'rb')
        for event, elem in ET.iterparse(f, ('start', )):
            result = elem.tag
            break
    finally:
        if f: f.close()
    return result

if __name__ == "__main__":
    import sys, glob
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            print filename, get_root_tag_from_xml_file(filename)
+2

XML?

ch = """\
<?xml version="1.0" encoding="ISO-8859-1" ?> 
<!--  Edited by XMLSpy® --> 
<CATALOG>
 <CD>
  <TITLE>Empire Burlesque</TITLE> 
  <ARTIST>Bob Dylan</ARTIST> 
  <COUNTRY>USA</COUNTRY> 
  <COMPANY>Columbia</COMPANY> 
  <PRICE>10.90</PRICE> 
  <YEAR>1985</YEAR> 
 </CD>
 <CD>
  <TITLE>Hide your heart</TITLE> 
  <ARTIST>Bonnie Tyler</ARTIST> 
  <COUNTRY>UK</COUNTRY> 
  <COMPANY>CBS Records</COMPANY> 
  <PRICE>9.90</PRICE> 
  <YEAR>1988</YEAR> 
 </CD>
</CATALOG>
<!-- This is the end of arachni report --> 

"""

chrstr = ch.strip()
x = max(chrstr.rfind('\r'),chrstr.rfind('\n'))
lastline = chrstr[x+1:]
if lastline[0:5]=='<!-- ':
    chrstr = ch[0:x].rstrip()
    x = max(chrstr.rfind('\r'),chrstr.rfind('\n'))
    print chrstr[x+1:]
else:
    print lastline

,

</CATALOG>

,

.

, , (, 200 600 ), 200 600 long ( XL , ?)

from os.path import getsize

with open('I:\\uuu.txt') as f:

    L = getsize('I:\\uuu.txt')
    print 'L==',L

    f.seek( -min(600,L) , 2)
    ch = f.read()
    if '\r' not in ch and '\n' not in ch:
        f.seek(0,0)
        ch = f.read()        

    chrstr = ch.strip()
    x = max(chrstr.rfind('\r'),chrstr.rfind('\n'))
    lastline = chrstr[x+1:]
    if lastline[0:5]=='<!-- ':
        chrstr = ch[0:x].rstrip()
        x = max(chrstr.rfind('\r'),chrstr.rfind('\n'))
        print chrstr[x+1:]
    else:
        print lastline
0

: , , , XML, , . @eyquem : .

- , , :

f = open(the_file)
head = f.read(200)
f.close()
if "<arachni_report" in head:
    #.. re-open and parse as arachni ..
elif "<nmaprun" in head:
    #.. re-open and parse as nmaprun ..

, , , .

0

: ( ), ( ).

eyquem , :)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from cStringIO import StringIO

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

class reportRecognizer(object):

    def __init__(self, xml_report_path):
        self.report_type = ""

        root_tag = self.get_root_tag_from_xml_file(xml_report_path)

        if root_tag:
            self.report_type = self.rType(root_tag)


    def get_root_tag_from_xml_file(self, xml_file_path):
        result = f = None
        try:
            f = open(xml_file_path, 'rb')
            for event, elem in ET.iterparse(f, ('start', )):
                result = elem.tag
                break
        except IOError, err:
            print "Error while opening file.\n%s. %s" % (err, filepath)
        finally:
            if f: f.close()
        return result

    def rType(self, tag):
        if "arachni_report" == tag:
            return "Arachni XML Report"
        elif "nmaprun" == tag:
            return "Nmap XML Report"
        else:
            return "Unrecognized XML file, sorry."

report = reportRecognizer('report.xml')
print report.report_type
0

, , , ?

XML, :

W3C XML

, . :

" XML- , ; XML."

http://en.wikipedia.org/wiki/XML

The real evil is here.

, , , XML? , .

.

, , , . , , "..whisp... XML-... whisp whisp"

import re
from os.path import getsize

with open('I:\\uuu.txt') as f:

    L = getsize('I:\\uuu.txt')
    print 'L==',L

    f.seek( -min(600,L) , 2)
    for mat in re.finditer('^</[^>]+>',f.read(),re.MULTILINE):
        pass
    print mat.group()

XML.

, , , iterparse() ElementTree !

.

, , ...

import re

with open('I:\\uuu.txt') as f:
    print re.search('^<(?![?!])[^>]+>',f.read(),re.MULTILINE).group()
0

, : . , , .

, . , , , , , . .

, -, XML W3c, XML-. , . , XML debonair.

, , , , XML . , W3c, \n, \r \t XML- , .

, - : , . , \t , . , XML , : , .

, , . , XML, , , . , XML: \n XML-.

.

, , whac_moles, , .

, , .

, : , XML- . , , , . XML-: XML, . , OP, .

.

, - , , , .

, :

~~ XML

~~ -200

~~ 200

~~ , :

  • , , \n,\r,\t
  • \n,\r,\t

, iterparse. . - .

# coding: ascii
import xml.etree.cElementTree as et
# import xml.etree.ElementTree as et
# import lxml.etree as et
from cStringIO import StringIO
import re,urllib

xml5 = """\
<?xml version="1.0" ?> 
<!--  this is a comment --> 
<root\t
\r\t\r \r
><foo

>bar</foo\t \r></root
>
"""

xml6 = """\
<?xml version="1.0" ?> 
<!--  this is a comment --> 
<root
><foo
>bar</foo\n\t   \t></root \t
\r>
<!--  \r   \t
That all, folks!

\t-->
"""

xml7 = '''<?xml version="1.0" ?>
<!-- <mole1> -->  
<root><foo

\t\t\r\r\t/></root  \t
>  
<!-- </mole2>\t \r
 \r-->
<!---->
'''

xml8 = '''<?xml version="1.0" ?><!-- \r<mole1> --><root>  \t\t<foo \t\r\r/></root>\t<!-- </mole2> -->'''


sock = urllib.urlopen('http://www.cafeconleche.org/books/bible/examples/18/18-4.xsl')
xml9 = sock.read()
sock.close()


def rp(x):
    return  '\\r' if x.group()=='\r' else '\\t'

for xml_text in (xml5, xml6, xml7, xml8, xml9):

    print '\\n\n'.join(re.sub('\r|\t',rp,xml_text).split('\n'))
    print '-----------------------------'

    xml_text_noc = re.sub('<!--.*?-->|[\n\r\t]','', xml_text,flags=re.DOTALL)
    RE11 = '(?<=</)[^ >]+(?= *>)(?!.*</[^>]+>)' # with assertions   # ^
    m  = re.search(RE11, xml_text_noc,re.DOTALL)
    print "***  eyquem 11: " + repr(m.group() if m else "FAIL")

    xml_text_noc = re.sub('<!--.*?-->|[\n\r\t]','', xml_text,flags=re.DOTALL)
    RE12 = '</([^ >]+) *>(?!.*</[^>]+>)'  # with group(1)   # ^
    m  = re.search(RE12, xml_text_noc,re.DOTALL)
    print "***  eyquem 12: " + repr(m.group(1) if m else "FAIL")

    xml_text_noc = re.sub('<!--.*?-->|[\n\r\t]','', xml_text,flags=re.DOTALL)
    RE13 = '</[^ >]+ *>(?!.*</[^>]+>)' # without group(1)   # ^
    m  = re.search(RE13, xml_text_noc,re.DOTALL)
    print "***  eyquem 13: " + repr(m.group()[2:-1].rstrip() if m else "FAIL")



    xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
    RE14 = '(?<=</)[^ \n\r\t>]+(?=[ \n\r\t]*>)(?!.*</[^>]+>)' # with assertions  # ^
    m  = re.search(RE14, xml_text_noc,re.DOTALL)
    print "***  eyquem 14: " + repr(m.group() if m else "FAIL")

    xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
    RE15 = '</([^ \n\r\t>]+)[ \n\r\t]*>(?!.*</[^>]+>)'  # with group(1)   # <
    m  = re.search(RE15, xml_text_noc,re.DOTALL)
    print "***  eyquem 15: " + repr(m.group(1).rstrip() if m else "FAIL")

    xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
    RE16 = '</[^ \n\r\t>]+[ \n\r\t]*>(?!.*</[^>]+>)' # without group(1)   # <
    m  = re.search(RE16, xml_text_noc,re.DOTALL)
    print "***  eyquem 16: " + repr(m.group()[2:-1].rstrip() if m else "FAIL")



    print
    filelike_obj = StringIO(xml_text)
    tree = et.parse(filelike_obj)
    print "***      parse:  " + tree.getroot().tag

    filelike_obj = StringIO(xml_text)
    for event, elem in et.iterparse(filelike_obj, ('start', 'end')):
        print "***  iterparse:  " + elem.tag
        break


    print '\n=============================================' 

<?xml version="1.0" ?> \n
<!--  this is a comment --> \n
<root\t\n
\r\t\r \r\n
><foo\n
\n
>bar</foo\t \r></root\n
>\n

-----------------------------
***  eyquem 11: 'root'
***  eyquem 12: 'root'
***  eyquem 13: 'root'
***  eyquem 14: 'root'
***  eyquem 15: 'root'
***  eyquem 16: 'root'

***      parse:  root
***  iterparse:  root

=============================================
<?xml version="1.0" ?> \n
<!--  this is a comment --> \n
<root\n
><foo\n
>bar</foo\n
\t   \t></root \t\n
\r>\n
<!--  \r   \t\n
That all, folks!\n
\n
\t-->\n

-----------------------------
***  eyquem 11: 'root'
***  eyquem 12: 'root'
***  eyquem 13: 'root'
***  eyquem 14: 'root'
***  eyquem 15: 'root'
***  eyquem 16: 'root'

***      parse:  root
***  iterparse:  root

=============================================
<?xml version="1.0" ?>\n
<!-- <mole1> -->  \n
<root><foo\n
\n
\t\t\r\r\t/></root  \t\n
>  \n
<!-- </mole2>\t\n
-->\n
<!---->\n

-----------------------------
***  eyquem 11: 'root'
***  eyquem 12: 'root'
***  eyquem 13: 'root'
***  eyquem 14: 'root'
***  eyquem 15: 'root'
***  eyquem 16: 'root'

***      parse:  root
***  iterparse:  root

=============================================
<?xml version="1.0" ?><!-- \r<mole1> --><root>  \t\t<foo \t\r\r/></root>\t<!-- </mole2> -->
-----------------------------
***  eyquem 11: 'root'
***  eyquem 12: 'root'
***  eyquem 13: 'root'
***  eyquem 14: 'root'
***  eyquem 15: 'root'
***  eyquem 16: 'root'

***      parse:  root
***  iterparse:  root

=============================================
<?xml version="1.0"?>\r\n
<stylesheet\r\n
  xmlns="http://www.w3.org/XSL/Transform/1.0"\r\n
  xmlns:fo="http://www.w3.org/XSL/Format/1.0"\r\n
  result-ns="fo">\r\n
\r\n
  <template match="/">\r\n
    <fo:root xmlns:fo="http://www.w3.org/XSL/Format/1.0">\r\n
\r\n
      <fo:layout-master-set>\r\n
        <fo:simple-page-master page-master-name="only">\r\n
          <fo:region-body/>\r\n
        </fo:simple-page-master>\r\n
      </fo:layout-master-set>\r\n
\r\n
      <fo:page-sequence>\r\n
\r\n
       <fo:sequence-specification>\r\n
        <fo:sequence-specifier-single page-master-name="only"/>\r\n
       </fo:sequence-specification>\r\n
        \r\n
        <fo:flow>\r\n
          <apply-templates select="//ATOM"/>\r\n
        </fo:flow>\r\n
        \r\n
      </fo:page-sequence>\r\n
\r\n
    </fo:root>\r\n
  </template>\r\n
\r\n
  <template match="ATOM">\r\n
    <fo:block font-size="20pt" font-family="serif">\r\n
      <value-of select="NAME"/>\r\n
    </fo:block>\r\n
  </template>\r\n
\r\n
</stylesheet>\r\n

-----------------------------
***  eyquem 11: 'stylesheet'
***  eyquem 12: 'stylesheet'
***  eyquem 13: 'stylesheet'
***  eyquem 14: 'stylesheet'
***  eyquem 15: 'stylesheet'
***  eyquem 16: 'stylesheet'

***      parse:  {http://www.w3.org/XSL/Transform/1.0}stylesheet
***  iterparse:  {http://www.w3.org/XSL/Transform/1.0}stylesheet

=============================================

:

# coding: ascii
import xml.etree.cElementTree as et
# import xml.etree.ElementTree as et
# import lxml.etree as et
from cStringIO import StringIO
import re
import urllib
from time import clock

sock = urllib.urlopen('http://www.cafeconleche.org/books/bible/examples/18/18-4.xsl')
ch = sock.read()
sock.close()

# the following lines are intended to insert additional lines
# into the XML text before its recording in a file, in order to
# obtain a real file to use, containing an XML text 
# long enough to observe easily the timing differences

li = ch.splitlines(True)[0:6] + 30*ch.splitlines(True)[6:-2] + ch.splitlines(True)[-2:]

with open('xml_example.xml','w') as f:
    f.write(''.join(li))

print 'length of XML text in a file : ',len(''.join(li)),'\n'



# timings

P,I,A,B,C,D,E,F = [],[],[],[],[],[],[],[],


n = 50

for cnt in xrange(50):

    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as filelike_obj:
            tree = et.parse(filelike_obj)
            res_parse = tree.getroot().tag
    P.append( clock()-te)

    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as filelike_obj:
            for event, elem in et.iterparse(filelike_obj, ('start', 'end')):
                res_iterparse = elem.tag
                break
    I.append(  clock()-te)


    RE11 = '(?<=</)[^ >]+(?= *>)(?!.*</[^>]+>)' # with assertions   # ^
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
            xml_text_noc = re.sub('(<!--.*?-->|[\n\r\t])','', xml_text,flags=re.DOTALL)
            m  = re.search(RE11, xml_text_noc,re.DOTALL)
            res_eyq11 = m.group() if m else "FAIL"
    A.append(  clock()-te)


    RE12 = '</([^ >]+) *>(?!.*</[^>]+>)'  # with group(1)   # ^
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
            xml_text_noc = re.sub('(<!--.*?-->|[\n\r\t])','', xml_text,flags=re.DOTALL)
            m  = re.search(RE12, xml_text_noc,re.DOTALL)
            res_eyq12 = m.group(1) if m else "FAIL"
    B.append(  clock()-te)


    RE13 = '</[^ >]+ *>(?!.*</[^>]+>)' # without group(1)   # ^
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
            xml_text_noc = re.sub('(<!--.*?-->|[\n\r\t])','', xml_text,flags=re.DOTALL)
            m  = re.search(RE13, xml_text_noc,re.DOTALL)
            res_eyq13 = m.group()[2:-1] if m else "FAIL"
    C.append(  clock()-te)



    RE14 = '(?<=</)[^ \n\r\t>]+(?=[ \n\r\t]*>)(?!.*</[^>]+>)' # with assertions  # ^
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
            xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
            m  = re.search(RE14, xml_text_noc,re.DOTALL)
            res_eyq14 = m.group() if m else "FAIL"
    D.append(  clock()-te)


    RE15 = '</([^ \n\r\t>]+)[ \n\r\t]*>(?!.*</[^>]+>)'  # with group(1)   # <
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
            xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
            m  = re.search(RE15, xml_text_noc,re.DOTALL)
            res_eyq15 = m.group(1) if m else "FAIL"
    E.append(  clock()-te)


    RE16 = '</[^ \n\r\t>]+[ \n\r\t]*>(?!.*</[^>]+>)' # without group(1)   # <
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
            xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
            m  = re.search(RE16, xml_text_noc,re.DOTALL)
            res_eyq16 = m.group()[2:-1].rstrip() if m else "FAIL"
    F.append(  clock()-te)


print "***      parse:  " + res_parse, '  parse'
print "***  iterparse:  " + res_iterparse, '  iterparse'
print
print "***  eyquem 11:  " + repr(res_eyq11)
print "***  eyquem 12:  " + repr(res_eyq12)
print "***  eyquem 13:  " + repr(res_eyq13)
print "***  eyquem 14:  " + repr(res_eyq14)
print "***  eyquem 15:  " + repr(res_eyq15)
print "***  eyquem 16:  " + repr(res_eyq16)

print
print str(min(P))
print str(min(I))
print
print '\n'.join(str(u) for u in map(min,(A,B,C)))
print
print '\n'.join(str(u) for u in map(min,(D,E,F)))

:

length of XML text in a file :  22548 

***      parse:  {http://www.w3.org/XSL/Transform/1.0}stylesheet   parse
***  iterparse:  {http://www.w3.org/XSL/Transform/1.0}stylesheet   iterparse

***  eyquem 11:  'stylesheet'
***  eyquem 12:  'stylesheet'
***  eyquem 13:  'stylesheet'
***  eyquem 14:  'stylesheet'
***  eyquem 15:  'stylesheet'
***  eyquem 16:  'stylesheet'

0.220554691169
0.172240771802

0.0273236743636
0.0266525536625
0.0265308269626

0.0246300539733
0.0241203758299
0.0238024015203

.

.

, Aereal, , \r \n \t ; , :

def get_root_tag_from_xml_file(xml_file_path):
    with open(xml_file_path) as f:
        try:      f.seek(-200,2)
        except:   f.seek(0,0)
        finally:  xml_text_noc = re.sub('<!--.*?-->','', f.read(), flags= re.DOTALL)
        try:
            return re.search('</[^>]+>(?!.*</[^>]+>)' , xml_text_noc, re.DOTALL).group()
        except :
            return 'FAIL'

Thanks to the experience of John Machin, this solution does a more reliable job than my previous one; and, moreover, it exactly meets the requirement, as it was expressed: no parsing, therefore, a faster method, since it was implicitly aimed at.

.

John Machin, will you discover a new complex XML format function that will invalidate this solution?

0
source

Source: https://habr.com/ru/post/1795615/


All Articles