# alpcentaur/laywerrobot


								"""

								    xml2csv.py

								    Kailash Nadh, http://nadh.in

								    October 2011


								    License:        MIT License

								    Documentation:    http://nadh.in/code/xmlutils.py

								"""


								import codecs

								import xml.etree.ElementTree as et

								import sys


								class xml2csv:
								    """Convert an XML file into a delimited text (CSV-style) file.

								    The input is walked incrementally with ``ElementTree.iterparse`` and
								    each element is cleared once its end event has been handled. Rows are
								    collected in ``output_buffer`` and written to the output file in
								    batches.
								    """

								    def __init__(self, input_file, output_file, encoding='utf-8'):
								        """Initialize the class with the paths to the input xml file
								        and the output csv file

								        Keyword arguments:
								        input_file -- input xml filename
								        output_file -- output csv filename
								        encoding -- character encoding used for the output file

								        Any error raised while opening the output file is re-raised after
								        a short notice has been printed.
								        """
								        self.output_buffer = []
								        self.output = None

								        # open the xml file for iteration
								        self.context = et.iterparse(input_file, events=("start", "end"))

								        # output file handle
								        try:
								            self.output = codecs.open(output_file, "w", encoding=encoding)
								        except Exception:
								            print("Failed to open the output file")
								            raise

								    def convert(self, tag="text", delimiter=" ", ignore=None, noheader=False,
								                limit=-1, buffer_size=1000, quotes=True):
								        """Convert the XML file to CSV file

								            Keyword arguments:
								            tag -- the record tag. eg: item
								            delimiter -- csv field delimiter
								            ignore -- collection of tags to ignore (None means ignore nothing)
								            noheader -- if True, do not write the header line of field names
								            limit -- maximum number of records to process; conversion stops
								                     when the record count equals this value, so the
								                     default of -1 never stops it
								            buffer_size -- number of records to keep in buffer before writing to disk
								            quotes -- insert quotes around values (e.g. "user@domain.com")

								            Returns:
								            number of records converted

								            The output file is closed before this method returns or raises.
								        """
								        # None stands in for "ignore nothing" so that no mutable default
								        # object is shared between calls.
								        if ignore is None:
								            ignore = ()

								        items = []             # field values of the record being read
								        header_line = []       # field names gathered from the first record
								        field_name = ''        # '_'-joined chain of currently open field tags
								        processed_fields = set()  # tags already emitted for this record

								        tagged = False         # True once the header has been settled
								        started = False        # True once the first record tag was seen
								        n = 0

								        try:
								            # consume the first event (start of the root element)
								            next(self.context)

								            # iterate through the xml
								            for event, elem in self.context:
								                is_record = elem.tag == tag
								                # if elem is an unignored child node of the record tag,
								                # it should be written to buffer
								                should_write = not is_record and started and elem.tag not in ignore
								                # and if a header is required and if there isn't one
								                should_tag = not tagged and should_write and not noheader

								                if event == 'start':
								                    if is_record:
								                        processed_fields = set()
								                        started = True
								                    elif should_tag:
								                        # if elem is nested inside a "parent", field name
								                        # becomes parent_elem
								                        field_name = '_'.join((field_name, elem.tag)) if field_name else elem.tag
								                    continue

								                # event == 'end' from here on
								                if should_write:
								                    # only the first occurrence of a tag per record is kept
								                    if elem.tag not in processed_fields:
								                        processed_fields.add(elem.tag)
								                        if should_tag:
								                            header_line.append(field_name)  # add field name to csv header
								                        items.append('' if elem.text is None else elem.text.strip().replace('"', r'""'))
								                    if should_tag:
								                        # Remove current tag from the tag name chain. This must
								                        # mirror every push done on 'start', including repeated
								                        # tags, otherwise later header names pick up a stale
								                        # prefix.
								                        field_name = field_name.rpartition('_' + elem.tag)[0]

								                # end of traversing the record tag
								                elif is_record and items:
								                    # csv header (element tag names)
								                    if header_line and not tagged:
								                        self.output.write(delimiter.join(header_line) + '\n')
								                    tagged = True

								                    # send the csv to buffer
								                    if quotes:
								                        row = '"' + ('"' + delimiter + '"').join(items) + '"'
								                    else:
								                        row = delimiter.join(items)
								                    self.output_buffer.append(row)
								                    items = []
								                    n += 1

								                    # halt if the specified limit has been hit
								                    if n == limit:
								                        break

								                    # flush buffer to disk
								                    if len(self.output_buffer) > buffer_size:
								                        self._write_buffer()

								                elem.clear()  # discard element and recover memory

								            self._write_buffer()  # write rest of the buffer to file
								        finally:
								            # release the file handle even when parsing fails midway
								            self.output.close()

								        return n

								    def _write_buffer(self):
								        """Write records from buffer to the output file"""
								        # Nothing buffered: writing would only emit a stray blank line.
								        if self.output_buffer:
								            self.output.write('\n'.join(self.output_buffer) + '\n')
								            self.output_buffer = []


								# Command-line entry point: xml2csv.py <input.xml> <output.csv>
								# Guarded so that importing this module does not start a conversion.
								if __name__ == "__main__":
								    if len(sys.argv) < 3:
								        # a string argument makes sys.exit print it to stderr and exit non-zero
								        sys.exit("Usage: xml2csv.py <input.xml> <output.csv>")
								    yo = xml2csv(sys.argv[1], sys.argv[2])
								    yo.convert()