laywerrobot/gensim_w2v/xml2csv.py

134 lines
4.4 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
"""
xml2csv.py
Kailash Nadh, http://nadh.in
October 2011
License: MIT License
Documentation: http://nadh.in/code/xmlutils.py
"""
import codecs
import xml.etree.ElementTree as et
import sys
class xml2csv:
    """Stream an XML file into a delimited text (CSV-style) file.

    The input is read incrementally with ``ElementTree.iterparse``. Each
    element whose tag equals the ``tag`` argument of ``convert`` becomes one
    output row, and the elements nested inside it become that row's fields.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Initialize the class with the paths to the input xml file
        and the output csv file

        Keyword arguments:
        input_file -- input xml filename
        output_file -- output csv filename
        encoding -- character encoding

        Raises:
        whatever ``codecs.open`` raises when the output file cannot be
        opened; a short message is printed before the error is re-raised.
        """
        self.output_buffer = []
        self.output = None
        # open the xml file for iteration
        self.context = et.iterparse(input_file, events=("start", "end"))
        # output file handle
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not reported as an output-file problem.
            print("Failed to open the output file")
            raise

    def convert(self, tag="text", delimiter=" ", ignore=None, noheader=False,
                limit=-1, buffer_size=1000, quotes=True):
        """Convert the XML file to CSV file

        Keyword arguments:
        tag -- the record tag. eg: item
        delimiter -- csv field delimiter
        ignore -- list of tags to ignore (None means ignore nothing)
        noheader -- if true, do not write the header line of field names
        limit -- maximum number of records to process (-1 never matches the
                 record count, so it means "no limit")
        buffer_size -- number of records to keep in buffer before writing to disk
        quotes -- insert quotes around values (e.g. "user@domain.com")

        Returns:
        number of records converted

        The output file is closed when this method finishes, whether it
        returns normally or the parser raises.
        """
        if ignore is None:
            ignore = ()

        items = []             # field values of the record being assembled
        header_line = []       # field names collected from the first record
        field_name = ''        # "parent_child" name chain of the open element
        processed_fields = []  # tags already emitted for the current record
        tagged = False         # True once the first record has been completed
        started = False        # True once the first record tag has opened
        n = 0

        try:
            # get to the root: consume its "start" event
            next(self.context)

            # iterate through the xml
            for event, elem in self.context:
                # if elem is an unignored child node of the record tag, it
                # should be written to buffer
                should_write = (elem.tag != tag and started
                                and elem.tag not in ignore)
                # and if a header is required and if there isn't one
                should_tag = not tagged and should_write and not noheader

                if event == 'start':
                    if elem.tag == tag:
                        # a new record begins: forget the previous one's tags
                        processed_fields = []
                    if elem.tag == tag and not started:
                        started = True
                    elif should_tag:
                        # if elem is nested inside a "parent", field name
                        # becomes parent_elem
                        field_name = ('_'.join((field_name, elem.tag))
                                      if field_name else elem.tag)
                    continue

                # --- "end" event ---
                if should_write and elem.tag not in processed_fields:
                    # only the first occurrence of a tag per record is kept
                    processed_fields.append(elem.tag)
                    if should_tag:
                        # add field name to csv header
                        header_line.append(field_name)
                        # remove current tag from the tag name chain
                        field_name = field_name.rpartition('_' + elem.tag)[0]
                    # embedded double quotes are doubled whether or not
                    # `quotes` is set
                    items.append('' if elem.text is None
                                 else elem.text.strip().replace('"', r'""'))
                # end of traversing the record tag
                elif elem.tag == tag and items:
                    # csv header (element tag names), written once
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                    tagged = True
                    # send the csv to buffer
                    if quotes:
                        row = '"' + ('"' + delimiter + '"').join(items) + '"'
                    else:
                        row = delimiter.join(items)
                    self.output_buffer.append(row)
                    items = []
                    n += 1
                    # halt if the specified limit has been hit
                    if n == limit:
                        break
                    # flush buffer to disk
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()
                # discard element and recover memory; done on "end" only, so
                # text that has already been parsed is never wiped early
                elem.clear()

            self._write_buffer()  # write rest of the buffer to file
        finally:
            # always release the output handle, even if parsing failed
            self.output.close()
        return n

    def _write_buffer(self):
        """Write records from buffer to the output file

        Nothing is written when the buffer is empty, so the output never
        gets a stray blank line.
        """
        if not self.output_buffer:
            return
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []
# Command-line entry point: xml2csv.py <input.xml> <output.csv>
# Guarded so that importing this module does not start a conversion.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: xml2csv.py <input.xml> <output.csv>")
    yo = xml2csv(sys.argv[1], sys.argv[2])
    yo.convert()