|
|
- """
- xml2csv.py
- Kailash Nadh, http://nadh.in
- October 2011
-
- License: MIT License
- Documentation: http://nadh.in/code/xmlutils.py
- """
-
- import codecs
- import xml.etree.ElementTree as et
- import sys
-
class xml2csv:
    """Stream an XML file into a delimited text (CSV) file.

    The input is read incrementally with ``ElementTree.iterparse`` and each
    element is cleared once its end event has been handled, to limit memory
    use. An instance is single-use: ``convert`` closes the output file when
    it finishes.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Initialize the class with the paths to the input xml file
        and the output csv file.

        Keyword arguments:
        input_file -- input xml filename
        output_file -- output csv filename
        encoding -- character encoding used for the output file

        Any error raised while opening either file is propagated to the
        caller (a short message is printed first when the output file
        cannot be opened).
        """
        self.output_buffer = []
        self.output = None

        # open the xml file for iteration
        self.context = et.iterparse(input_file, events=("start", "end"))

        # output file handle
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except Exception:
            print("Failed to open the output file")
            raise

    def convert(self, tag="text", delimiter=" ", ignore=None, noheader=False,
                limit=-1, buffer_size=1000, quotes=True):
        """Convert the XML file to CSV file.

        Keyword arguments:
        tag -- the record tag. eg: item
        delimiter -- csv field delimiter
        ignore -- list of tags to ignore (None means ignore nothing)
        noheader -- if True, do not write the header line of field names
        limit -- maximum number of records to process (-1 for no limit)
        buffer_size -- number of records to keep in buffer before writing to disk
        quotes -- insert quotes around values (e.g. "user@domain.com")

        Returns:
        number of records converted

        The output file is closed before returning, including when parsing
        raises an exception.
        """
        if ignore is None:
            ignore = []

        items = []              # values collected for the current record
        header_line = []        # field names gathered from the first record
        field_name = ''         # '_'-joined chain of currently open child tags
        processed_fields = set()  # tags already emitted for the current record

        tagged = False          # True once the first record has been emitted
        started = False         # True once the first record tag has been seen
        n = 0

        try:
            # get to the root (consume its start event)
            next(self.context)

            # iterate through the xml
            for event, elem in self.context:
                # if elem is an unignored child node of the record tag,
                # it should be written to buffer
                should_write = elem.tag != tag and started and elem.tag not in ignore
                # and if a header is required and if there isn't one
                should_tag = not tagged and should_write and not noheader

                if event == 'start':
                    if elem.tag == tag:
                        # a new record begins: forget the fields of the previous one
                        processed_fields = set()
                        started = True
                    elif should_tag:
                        # if elem is nested inside a "parent", field name becomes parent_elem
                        field_name = '_'.join((field_name, elem.tag)) if field_name else elem.tag
                    continue

                # --- 'end' event ---
                if should_write:
                    # only the first occurrence of a tag within a record is kept
                    if elem.tag not in processed_fields:
                        processed_fields.add(elem.tag)
                        if should_tag:
                            header_line.append(field_name)  # add field name to csv header
                        text = elem.text
                        items.append('' if text is None else text.strip().replace('"', '""'))
                    if should_tag:
                        # remove current tag from the tag name chain; this must
                        # also happen for skipped duplicates, because their start
                        # event extended the chain too
                        field_name = field_name.rpartition('_' + elem.tag)[0]

                # end of traversing the record tag
                elif elem.tag == tag and items:
                    # csv header (element tag names)
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                        tagged = True

                    # send the csv to buffer
                    if quotes:
                        self.output_buffer.append('"' + ('"' + delimiter + '"').join(items) + '"')
                    else:
                        self.output_buffer.append(delimiter.join(items))
                    items = []
                    n += 1

                    # halt if the specified limit has been hit
                    if n == limit:
                        break

                    # flush buffer to disk
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()

                elem.clear()  # discard element and recover memory

            self._write_buffer()  # write rest of the buffer to file
        finally:
            self.output.close()
            # Newer Python versions expose close() on the iterparse iterator;
            # use it when available so the input file is released promptly
            # (e.g. after stopping early because of ``limit``).
            close_input = getattr(self.context, "close", None)
            if close_input is not None:
                close_input()

        return n

    def _write_buffer(self):
        """Write records from buffer to the output file"""
        # nothing buffered: avoid emitting a stray blank line
        if not self.output_buffer:
            return

        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []
-
-
if __name__ == "__main__":
    # Command-line entry point: xml2csv.py <input.xml> <output.csv>
    # Guarded so that importing this module does not trigger a conversion.
    if len(sys.argv) < 3:
        sys.exit("Usage: xml2csv.py <input.xml> <output.csv>")
    yo = xml2csv(sys.argv[1], sys.argv[2])
    yo.convert()
-
-
|