"""
xml2csv.py
Kailash Nadh, http://nadh.in
October 2011

License: MIT License
Documentation: http://nadh.in/code/xmlutils.py
"""

import codecs
import sys
import xml.etree.ElementTree as et

class xml2csv:

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Initialize the class with the paths to the input xml file
        and the output csv file

        Keyword arguments:
        input_file -- input xml filename
        output_file -- output csv filename
        encoding -- character encoding
        """
        self.output_buffer = []
        self.output = None

        # open the xml file for iteration
        self.context = et.iterparse(input_file, events=("start", "end"))

        # output file handle
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except:
            print("Failed to open the output file")
            raise
    def convert(self, tag="text", delimiter=" ", ignore=[], noheader=False,
                limit=-1, buffer_size=1000, quotes=True):
        """Convert the XML file to a CSV file

        Keyword arguments:
        tag -- the record tag. eg: item
        delimiter -- csv field delimiter
        ignore -- list of tags to ignore
        limit -- maximum number of records to process
        buffer_size -- number of records to keep in buffer before writing to disk
        quotes -- insert quotes around values (e.g. "user@domain.com")

        Returns:
        the number of records converted
        """
        # get to the root
        event, root = self.context.__next__()

        items = []
        header_line = []
        field_name = ''
        processed_fields = []

        tagged = False
        started = False
        n = 0

        # iterate through the xml
        for event, elem in self.context:
            # if elem is an unignored child node of the record tag, it should be written to buffer
            should_write = elem.tag != tag and started and elem.tag not in ignore
            # and if a header is required and if there isn't one
            should_tag = not tagged and should_write and not noheader

            if event == 'start':
                if elem.tag == tag:
                    # new record: reset the list of fields already written for it
                    processed_fields = []
                if elem.tag == tag and not started:
                    started = True
                elif should_tag:
                    # if elem is nested inside a "parent", field name becomes parent_elem
                    field_name = '_'.join((field_name, elem.tag)) if field_name else elem.tag

            else:
                # write each child tag only once per record
                if should_write and elem.tag not in processed_fields:
                    processed_fields.append(elem.tag)
                    if should_tag:
                        header_line.append(field_name)  # add field name to csv header
                        # remove current tag from the tag name chain
                        field_name = field_name.rpartition('_' + elem.tag)[0]
                    items.append('' if elem.text is None else elem.text.strip().replace('"', r'""'))

                # end of traversing the record tag
                elif elem.tag == tag and len(items) > 0:
                    # csv header (element tag names)
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                    tagged = True

                    # send the csv to buffer
                    if quotes:
                        self.output_buffer.append(r'"' + (r'"' + delimiter + r'"').join(items) + r'"')
                    else:
                        self.output_buffer.append(delimiter.join(items))
                    items = []
                    n += 1

                    # halt if the specified limit has been hit
                    if n == limit:
                        break

                    # flush buffer to disk
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()

                elem.clear()  # discard element and recover memory

        self._write_buffer()  # write rest of the buffer to file
        self.output.close()

        return n
    def _write_buffer(self):
        """Write records from buffer to the output file"""
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []

if __name__ == '__main__':
    converter = xml2csv(sys.argv[1], sys.argv[2])
    converter.convert()
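
# Illustrative usage (not part of the original module). The sample XML below is an
# assumed input; with the default tag="text", delimiter=" " and quotes=True the
# converter should behave roughly as follows:
#
#   input.xml:
#       <records>
#           <text><title>foo</title><body>bar</body></text>
#           <text><title>baz</title><body>qux</body></text>
#       </records>
#
#   python xml2csv.py input.xml output.csv
#
#   output.csv:
#       title body
#       "foo" "bar"
#       "baz" "qux"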