# laywerrobot/gensim_w2v/xml2csv.py

"""
    xml2csv.py
    Kailash Nadh, http://nadh.in
    October 2011

    License:        MIT License
    Documentation:    http://nadh.in/code/xmlutils.py
"""

import codecs
import xml.etree.ElementTree as et
import sys

class xml2csv:
    """Stream records from an XML file into a delimited text (CSV) file.

    The input is parsed incrementally with ``ElementTree.iterparse`` and
    elements are cleared once their "end" event has been handled, so the
    whole document does not have to be held in memory.

    An instance is single-use: ``convert()`` consumes the parse iterator and
    closes the output file when it finishes (successfully or not).
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Initialize the class with the paths to the input xml file
        and the output csv file
        Keyword arguments:
        input_file -- input xml filename
        output_file -- output csv filename
        encoding -- character encoding of the output file
        Raises:
        whatever ``codecs.open`` raises if the output file cannot be opened
        (a short message is printed first).
        """

        # formatted record lines waiting to be written to disk
        self.output_buffer = []
        self.output = None

        # open the xml file for iteration
        self.context = et.iterparse(input_file, events=("start", "end"))

        # output file handle
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except Exception:
            # Narrower than a bare ``except`` so KeyboardInterrupt/SystemExit
            # are not reported as a failure to open the file.
            print("Failed to open the output file")
            raise

    def convert(self, tag="text", delimiter=" ", ignore=None, noheader=False,
                limit=-1, buffer_size=1000, quotes=True):
        """Convert the XML file to CSV file
            Keyword arguments:
            tag -- the record tag. eg: item
            delimiter -- csv field delimiter
            ignore -- collection of tags to ignore (None means ignore nothing)
            noheader -- if True, do not write the header line
            limit -- maximum number of records to process; values below 1
                     never match the record counter, so nothing is limited
            buffer_size -- the buffer is flushed to disk once it holds more
                           than this many records
            quotes -- insert quotes around values (e.g. "user@domain.com")
            Returns:
            number of records converted
            Notes:
            The header is built from the fields of the first record only and
            is written without quotes. Within one record only the first
            element with a given tag name is written. Double quotes inside a
            value are always doubled, whether or not ``quotes`` is set.
            The output file is closed before this method returns or raises.
        """

        # None stands in for "ignore nothing" to avoid a shared mutable default.
        ignore = () if ignore is None else ignore

        items = []             # values of the record currently being read
        header_line = []       # field names collected from the first record
        field_name = ''        # underscore-joined chain of open tag names
        processed_fields = []  # tag names already emitted for this record

        tagged = False   # True once the first record (and header) is done
        started = False  # True once the first record tag has been seen
        n = 0

        try:
            # get to the root (consumes the root's "start" event)
            next(self.context)

            # iterate through the xml
            for event, elem in self.context:
                # if elem is an unignored child node of the record tag, it
                # should be written to buffer
                should_write = elem.tag != tag and started and elem.tag not in ignore
                # and if a header is required and if there isn't one
                should_tag = not tagged and should_write and not noheader

                if event == 'start':
                    if elem.tag == tag:
                        # a new record begins: forget the fields seen so far
                        processed_fields = []
                    if elem.tag == tag and not started:
                        started = True
                    elif should_tag:
                        # if elem is nested inside a "parent", field name
                        # becomes parent_elem
                        field_name = '_'.join((field_name, elem.tag)) if field_name else elem.tag

                else:
                    if should_write and elem.tag not in processed_fields:
                        processed_fields.append(elem.tag)
                        if should_tag:
                            header_line.append(field_name)  # add field name to csv header
                            # remove current tag from the tag name chain
                            field_name = field_name.rpartition('_' + elem.tag)[0]
                        items.append('' if elem.text is None else elem.text.strip().replace('"', r'""'))

                    # end of traversing the record tag
                    elif elem.tag == tag and len(items) > 0:
                        # csv header (element tag names)
                        if header_line and not tagged:
                            self.output.write(delimiter.join(header_line) + '\n')
                        tagged = True

                        # send the csv to buffer
                        if quotes:
                            self.output_buffer.append(r'"' + (r'"' + delimiter + r'"').join(items) + r'"')
                        else:
                            self.output_buffer.append(delimiter.join(items))
                        items = []
                        n += 1

                        # halt if the specified limit has been hit
                        if n == limit:
                            break

                        # flush buffer to disk
                        if len(self.output_buffer) > buffer_size:
                            self._write_buffer()

                    elem.clear()  # discard element and recover memory

            self._write_buffer()  # write rest of the buffer to file
        finally:
            # Always release the file handle, even if parsing failed midway.
            self.output.close()

        return n

    def _write_buffer(self):
        """Write records from buffer to the output file and empty the buffer.

        Does nothing when the buffer is empty, so no stray blank line is
        written after a flush that already emptied it or when no record
        was found.
        """

        if not self.output_buffer:
            return

        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []


# Command-line use: xml2csv.py <input.xml> <output.csv>
# Guarded so that importing this module does not read sys.argv or start a
# conversion as a side effect.
if __name__ == "__main__":
    yo = xml2csv(sys.argv[1], sys.argv[2])
    yo.convert()