How do I use the SAX XML parser to read and write a large XML file?

I am trying to remove all project1 nodes (along with their children) from the sample XML document below (the real source document is about 30 GB) using the SAX parser. Writing the result to a separate, modified file would be ideal, but in-place editing is also fine.

sample.xml

 <ROOT> <test src="http://dfs.com">Hi</test> <project1>This is old data<foo></foo></project1> <bar> <project1>ty</project1> <foo></foo> </bar> </ROOT> 

Here is my attempt.

parser.py

 from xml.sax.handler import ContentHandler
 import xml.sax


 # SAX content handler from the question: it tries to copy the input document
 # to ``out_file`` while leaving out every ``project1`` element.
 # NOTE(review): this is the non-working attempt the question asks about; the
 # comments below point out where it goes wrong.
 class MyHandler(xml.sax.handler.ContentHandler):
     def __init__(self, out_file):
         # Text chunks received from characters() since the last flush.
         self._charBuffer = []
         # One dict is appended per non-project1 start tag (see startElement).
         self._result = []
         # NOTE(review): the file is opened here and never closed in this code.
         self._out = open(out_file, 'w')

     # Build the start tag for ``name``, appending any attributes.
     def _createElement(self, name, attrs):
         attributes = attrs.items()
         if attributes:
             out = ''
             for key, value in attributes:
                 # NOTE(review): the value is written without quotes or
                 # escaping, so the emitted tag is not well-formed XML.
                 out += ' {}={}'.format(key, value)
             return '<{}{}>'.format(name, out)
         return '<{}>'.format(name)

     # Write the buffered character data to the output file and reset the
     # buffer.  There is no return statement, so the caller receives None.
     def _getCharacterData(self):
         data = ''.join(self._charBuffer).strip()
         self._charBuffer = []
         self._out.write(data.strip())  # remove strip() if whitespace is important

     # Parse the file ``f`` with this object as the content handler.
     def parse(self, f):
         xml.sax.parse(f, self)

     # Buffer every text chunk.
     # NOTE(review): text inside project1 is buffered as well and is written
     # out by the next endElement of a non-project1 element.
     def characters(self, data):
         self._charBuffer.append(data)

     # Write the start tag of every element whose name is not 'project1'.
     # NOTE(review): only the element's own name is checked, so children of
     # project1 (for example <foo>) are still written.
     def startElement(self, name, attrs):
         if not name == 'project1':
             self._result.append({})
             self._out.write(self._createElement(name, attrs))

     # Flush the buffered text for every element whose name is not 'project1'.
     # NOTE(review): no closing tag is ever written, and the value stored in
     # self._result is always None (see _getCharacterData).
     def endElement(self, name):
         if not name == 'project1':
             self._result[-1][name] = self._getCharacterData()


 MyHandler('out.xml').parse("sample.xml")

I cannot make it work.

+5
source share
1 answer

You can use xml.sax.saxutils.XMLFilterBase to filter your project1 nodes.

Instead of assembling the XML strings yourself, you can use xml.sax.saxutils.XMLGenerator to write the output.

Below is the Python 3 code; adjust the super() calls if you need Python 2.

 from xml.sax import make_parser from xml.sax.saxutils import XMLFilterBase, XMLGenerator class Project1Filter(XMLFilterBase): """This decides which SAX events to forward to the ContentHandler We will not forward events when we are inside any elements with a name specified in the 'tags_names_to_exclude' parameter """ def __init__(self, tag_names_to_exclude, parent=None): super().__init__(parent) # set of tag names to exclude self._tag_names_to_exclude = tag_names_to_exclude # _project_1_count keeps track of opened project1 elements self._project_1_count = 0 def _forward_events(self): # will return True when we are not inside a project1 element return self._project_1_count == 0 def startElement(self, name, attrs): if name in self._tag_names_to_exclude: self._project_1_count += 1 if self._forward_events(): super().startElement(name, attrs) def endElement(self, name): if self._forward_events(): super().endElement(name) if name in self._tag_names_to_exclude: self._project_1_count -= 1 def characters(self, content): if self._forward_events(): super().characters(content) # override other content handler methods on XMLFilterBase as neccessary def main(): tag_names_to_exclude = {'project1', 'project2', 'project3'} reader = Project1Filter(tag_names_to_exclude, make_parser()) with open('out-small.xml', 'w') as f: handler = XMLGenerator(f) reader.setContentHandler(handler) reader.parse('input.xml') if __name__ == "__main__": main() 
+3
source

Source: https://habr.com/ru/post/1264413/


All Articles