, : . , , .
, .
, , , , , . .
, -, XML W3c, XML-. , . , XML debonair.
, , , , XML . , W3c, \n
, \r
\t
XML- , .
, - : , . , \t
, . , XML , : , .
, , . , XML, , , . , XML: \n
XML-.
.
, , whac_moles, , .
, , .
, : , XML- . , , , . XML-: XML, . , OP, .
.
, - , , , .
, :
~~ XML
~~ -200
~~ 200
~~ , :
, iterparse. . - .
import xml.etree.cElementTree as et
from cStringIO import StringIO
import re,urllib
# Test documents for the root-tag extraction experiments below. These are
# ordinary (non-raw) string literals, so every \t, \r and \n escape becomes a
# real tab, carriage return or newline inside the XML text.

# xml5: start and end tags split across lines, with tabs and carriage returns
# between the tag name and the closing '>'.
xml5 = """\
<?xml version="1.0" ?>
<!-- this is a comment -->
<root\t
\r\t\r \r
><foo
>bar</foo\t \r></root
>
"""
# xml6: end tags padded with newline/tab/space/CR before '>', followed by a
# trailing multi-line comment after the root element.
xml6 = """\
<?xml version="1.0" ?>
<!-- this is a comment -->
<root
><foo
>bar</foo\n\t \t></root \t
\r>
<!-- \r \t
That all, folks!
\t-->
"""
# xml7: comments that contain tag-like text (<mole1>, </mole2>), a
# self-closing <foo/> padded with whitespace, and an empty comment at the end.
xml7 = '''<?xml version="1.0" ?>
<!-- <mole1> -->
<root><foo
\t\t\r\r\t/></root \t
>
<!-- </mole2>\t \r
\r-->
<!---->
'''
# xml8: the same kind of content as xml7 written as a single line with no
# newline characters.
xml8 = '''<?xml version="1.0" ?><!-- \r<mole1> --><root> \t\t<foo \t\r\r/></root>\t<!-- </mole2> -->'''
# xml9: a real-world stylesheet fetched over HTTP. The try/finally makes sure
# the connection is closed even when read() raises part-way through.
sock = urllib.urlopen('http://www.cafeconleche.org/books/bible/examples/18/18-4.xsl')
try:
    xml9 = sock.read()
finally:
    sock.close()
def rp(x):
    """Return the visible escape text for a matched control character.

    ``x`` is a regex match object. A carriage return maps to the two
    characters backslash + 'r'; any other match maps to backslash + 't'.
    """
    if x.group() == '\r':
        return '\\r'
    return '\\t'
for xml_text in (xml5, xml6, xml7, xml8, xml9):
print '\\n\n'.join(re.sub('\r|\t',rp,xml_text).split('\n'))
print '-----------------------------'
xml_text_noc = re.sub('<!--.*?-->|[\n\r\t]','', xml_text,flags=re.DOTALL)
RE11 = '(?<=</)[^ >]+(?= *>)(?!.*</[^>]+>)'
m = re.search(RE11, xml_text_noc,re.DOTALL)
print "*** eyquem 11: " + repr(m.group() if m else "FAIL")
xml_text_noc = re.sub('<!--.*?-->|[\n\r\t]','', xml_text,flags=re.DOTALL)
RE12 = '</([^ >]+) *>(?!.*</[^>]+>)'
m = re.search(RE12, xml_text_noc,re.DOTALL)
print "*** eyquem 12: " + repr(m.group(1) if m else "FAIL")
xml_text_noc = re.sub('<!--.*?-->|[\n\r\t]','', xml_text,flags=re.DOTALL)
RE13 = '</[^ >]+ *>(?!.*</[^>]+>)'
m = re.search(RE13, xml_text_noc,re.DOTALL)
print "*** eyquem 13: " + repr(m.group()[2:-1].rstrip() if m else "FAIL")
xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
RE14 = '(?<=</)[^ \n\r\t>]+(?=[ \n\r\t]*>)(?!.*</[^>]+>)'
m = re.search(RE14, xml_text_noc,re.DOTALL)
print "*** eyquem 14: " + repr(m.group() if m else "FAIL")
xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
RE15 = '</([^ \n\r\t>]+)[ \n\r\t]*>(?!.*</[^>]+>)'
m = re.search(RE15, xml_text_noc,re.DOTALL)
print "*** eyquem 15: " + repr(m.group(1).rstrip() if m else "FAIL")
xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
RE16 = '</[^ \n\r\t>]+[ \n\r\t]*>(?!.*</[^>]+>)'
m = re.search(RE16, xml_text_noc,re.DOTALL)
print "*** eyquem 16: " + repr(m.group()[2:-1].rstrip() if m else "FAIL")
print
filelike_obj = StringIO(xml_text)
tree = et.parse(filelike_obj)
print "*** parse: " + tree.getroot().tag
filelike_obj = StringIO(xml_text)
for event, elem in et.iterparse(filelike_obj, ('start', 'end')):
print "*** iterparse: " + elem.tag
break
print '\n============================================='
<?xml version="1.0" ?> \n
\n
<root\t\n
\r\t\r \r\n
><foo\n
\n
>bar</foo\t \r></root\n
>\n
-----------------------------
*** eyquem 11: 'root'
*** eyquem 12: 'root'
*** eyquem 13: 'root'
*** eyquem 14: 'root'
*** eyquem 15: 'root'
*** eyquem 16: 'root'
*** parse: root
*** iterparse: root
=============================================
<?xml version="1.0" ?> \n
\n
<root\n
><foo\n
>bar</foo\n
\t \t></root \t\n
\r>\n
\n
-----------------------------
*** eyquem 11: 'root'
*** eyquem 12: 'root'
*** eyquem 13: 'root'
*** eyquem 14: 'root'
*** eyquem 15: 'root'
*** eyquem 16: 'root'
*** parse: root
*** iterparse: root
=============================================
<?xml version="1.0" ?>\n
\n
<root><foo\n
\n
\t\t\r\r\t/></root \t\n
> \n
\n
\n
-----------------------------
*** eyquem 11: 'root'
*** eyquem 12: 'root'
*** eyquem 13: 'root'
*** eyquem 14: 'root'
*** eyquem 15: 'root'
*** eyquem 16: 'root'
*** parse: root
*** iterparse: root
=============================================
<?xml version="1.0" ?><root> \t\t<foo \t\r\r/></root>\t
-----------------------------
*** eyquem 11: 'root'
*** eyquem 12: 'root'
*** eyquem 13: 'root'
*** eyquem 14: 'root'
*** eyquem 15: 'root'
*** eyquem 16: 'root'
*** parse: root
*** iterparse: root
=============================================
<?xml version="1.0"?>\r\n
<stylesheet\r\n
xmlns="http://www.w3.org/XSL/Transform/1.0"\r\n
xmlns:fo="http://www.w3.org/XSL/Format/1.0"\r\n
result-ns="fo">\r\n
\r\n
<template match="/">\r\n
<fo:root xmlns:fo="http://www.w3.org/XSL/Format/1.0">\r\n
\r\n
<fo:layout-master-set>\r\n
<fo:simple-page-master page-master-name="only">\r\n
<fo:region-body/>\r\n
</fo:simple-page-master>\r\n
</fo:layout-master-set>\r\n
\r\n
<fo:page-sequence>\r\n
\r\n
<fo:sequence-specification>\r\n
<fo:sequence-specifier-single page-master-name="only"/>\r\n
</fo:sequence-specification>\r\n
\r\n
<fo:flow>\r\n
<apply-templates select="//ATOM"/>\r\n
</fo:flow>\r\n
\r\n
</fo:page-sequence>\r\n
\r\n
</fo:root>\r\n
</template>\r\n
\r\n
<template match="ATOM">\r\n
<fo:block font-size="20pt" font-family="serif">\r\n
<value-of select="NAME"/>\r\n
</fo:block>\r\n
</template>\r\n
\r\n
</stylesheet>\r\n
-----------------------------
*** eyquem 11: 'stylesheet'
*** eyquem 12: 'stylesheet'
*** eyquem 13: 'stylesheet'
*** eyquem 14: 'stylesheet'
*** eyquem 15: 'stylesheet'
*** eyquem 16: 'stylesheet'
*** parse: {http://www.w3.org/XSL/Transform/1.0}stylesheet
*** iterparse: {http://www.w3.org/XSL/Transform/1.0}stylesheet
=============================================
:
import xml.etree.cElementTree as et
from cStringIO import StringIO
import re
import urllib
from time import clock
sock = urllib.urlopen('http://www.cafeconleche.org/books/bible/examples/18/18-4.xsl')
ch = sock.read()
sock.close()
li = ch.splitlines(True)[0:6] + 30*ch.splitlines(True)[6:-2] + ch.splitlines(True)[-2:]
with open('xml_example.xml','w') as f:
f.write(''.join(li))
print 'length of XML text in a file : ',len(''.join(li)),'\n'
# Benchmark: time 50 passes; in each pass every method is run n times against
# 'xml_example.xml' and the elapsed clock() time for those n runs is appended
# to that method's list (P: parse, I: iterparse, A-F: regex variants 11-16).
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting below is reconstructed from the later min() over each list --
# confirm against the original script.

# Patterns are loop-invariant, so they are defined once up front.
RE11 = '(?<=</)[^ >]+(?= *>)(?!.*</[^>]+>)'
RE12 = '</([^ >]+) *>(?!.*</[^>]+>)'
RE13 = '</[^ >]+ *>(?!.*</[^>]+>)'
RE14 = '(?<=</)[^ \n\r\t>]+(?=[ \n\r\t]*>)(?!.*</[^>]+>)'
RE15 = '</([^ \n\r\t>]+)[ \n\r\t]*>(?!.*</[^>]+>)'
RE16 = '</[^ \n\r\t>]+[ \n\r\t]*>(?!.*</[^>]+>)'

P,I,A,B,C,D,E,F = [],[],[],[],[],[],[],[]
n = 50
for cnt in xrange(50):

    # Full parse of the whole file.
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as filelike_obj:
            tree = et.parse(filelike_obj)
            res_parse = tree.getroot().tag
    P.append( clock()-te)

    # Incremental parse, abandoned after the first event.
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as filelike_obj:
            for event, elem in et.iterparse(filelike_obj, ('start', 'end')):
                res_iterparse = elem.tag
                break
    I.append( clock()-te)

    # Variants 11-13: read only the last 200 bytes, strip comments plus
    # newline/CR/tab, then search for the last closing tag.
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
        xml_text_noc = re.sub('(<!--.*?-->|[\n\r\t])','', xml_text,flags=re.DOTALL)
        m = re.search(RE11, xml_text_noc,re.DOTALL)
        res_eyq11 = m.group() if m else "FAIL"
    A.append( clock()-te)

    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
        xml_text_noc = re.sub('(<!--.*?-->|[\n\r\t])','', xml_text,flags=re.DOTALL)
        m = re.search(RE12, xml_text_noc,re.DOTALL)
        res_eyq12 = m.group(1) if m else "FAIL"
    B.append( clock()-te)

    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
        xml_text_noc = re.sub('(<!--.*?-->|[\n\r\t])','', xml_text,flags=re.DOTALL)
        m = re.search(RE13, xml_text_noc,re.DOTALL)
        # RE13 matches optional spaces before '>', so the slice can end in
        # spaces; strip them, as the non-benchmark version of this variant does.
        res_eyq13 = m.group()[2:-1].rstrip() if m else "FAIL"
    C.append( clock()-te)

    # Variants 14-16: same tail read, but only comments are stripped; the
    # patterns themselves accept newline/CR/tab before '>'.
    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
        xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
        m = re.search(RE14, xml_text_noc,re.DOTALL)
        res_eyq14 = m.group() if m else "FAIL"
    D.append( clock()-te)

    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
        xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
        m = re.search(RE15, xml_text_noc,re.DOTALL)
        res_eyq15 = m.group(1) if m else "FAIL"
    E.append( clock()-te)

    te = clock()
    for i in xrange (n):
        with open('xml_example.xml') as f:
            f.seek(-200,2)
            xml_text = f.read()
        xml_text_noc = re.sub('<!--.*?-->','', xml_text,flags=re.DOTALL)
        m = re.search(RE16, xml_text_noc,re.DOTALL)
        res_eyq16 = m.group()[2:-1].rstrip() if m else "FAIL"
    F.append( clock()-te)
print "*** parse: " + res_parse, ' parse'
print "*** iterparse: " + res_iterparse, ' iterparse'
print
print "*** eyquem 11: " + repr(res_eyq11)
print "*** eyquem 12: " + repr(res_eyq12)
print "*** eyquem 13: " + repr(res_eyq13)
print "*** eyquem 14: " + repr(res_eyq14)
print "*** eyquem 15: " + repr(res_eyq15)
print "*** eyquem 16: " + repr(res_eyq16)
print
print str(min(P))
print str(min(I))
print
print '\n'.join(str(u) for u in map(min,(A,B,C)))
print
print '\n'.join(str(u) for u in map(min,(D,E,F)))
:
length of XML text in a file : 22548
*** parse: {http://www.w3.org/XSL/Transform/1.0}stylesheet  parse
*** iterparse: {http://www.w3.org/XSL/Transform/1.0}stylesheet  iterparse
*** eyquem 11: 'stylesheet'
*** eyquem 12: 'stylesheet'
*** eyquem 13: 'stylesheet'
*** eyquem 14: 'stylesheet'
*** eyquem 15: 'stylesheet'
*** eyquem 16: 'stylesheet'
0.220554691169
0.172240771802
0.0273236743636
0.0266525536625
0.0265308269626
0.0246300539733
0.0241203758299
0.0238024015203
.
.
, Aereal, , \r
\n
\t
; , :
def get_root_tag_from_xml_file(xml_file_path):
    """Return the last closing tag found near the end of an XML file.

    Only the final 200 bytes of the file are examined (the whole file when
    seeking that far back from the end is not possible, e.g. because the file
    is shorter). XML comments are removed from that text, then the last
    ``</...>`` is located without parsing the document.

    Returns the matched closing tag exactly as written, including the
    ``</`` and ``>`` delimiters and any whitespace inside them, or the
    string ``'FAIL'`` when no closing tag is present.

    Errors from opening or reading the file are propagated to the caller.

    NOTE(review): a comment that begins before the 200-byte window is not
    removed, because its ``<!--`` opener is not in the text being examined.
    """
    with open(xml_file_path) as f:
        try:
            f.seek(-200, 2)
        except (IOError, OSError):
            # Cannot seek 200 bytes back from the end: read from the start.
            f.seek(0, 0)
        tail = f.read()

    xml_text_noc = re.sub('<!--.*?-->', '', tail, flags=re.DOTALL)
    # The negative lookahead rejects any closing tag that is followed by
    # another one, so the match is the last closing tag in the text.
    m = re.search('</[^>]+>(?!.*</[^>]+>)', xml_text_noc, re.DOTALL)
    if m is None:
        return 'FAIL'
    return m.group()
Thanks to John Machin's experience, this solution is more reliable than my previous one. Moreover, it meets the requirement exactly as it was expressed: no parsing, and therefore a faster method, which was the implicit aim.
.
John Machin, will you find yet another tricky feature of the XML format that invalidates this solution?