"""
xml2csv.py
Kailash Nadh, http://nadh.in
October 2011

License: MIT License
Documentation: http://nadh.in/code/xmlutils.py
"""

import codecs
import sys
import xml.etree.ElementTree as et

class xml2csv:

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Initialize the class with the paths to the input xml file
        and the output csv file

        Keyword arguments:
        input_file -- input xml filename
        output_file -- output csv filename
        encoding -- character encoding
        """
        self.output_buffer = []
        self.output = None

        # open the xml file for iteration
        self.context = et.iterparse(input_file, events=("start", "end"))

        # output file handle
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except:
            print("Failed to open the output file")
            raise
    def convert(self, tag="text", delimiter=" ", ignore=[], noheader=False,
                limit=-1, buffer_size=1000, quotes=True):
        """Convert the XML file to a CSV file

        Keyword arguments:
        tag -- the record tag. eg: item
        delimiter -- csv field delimiter
        ignore -- list of tags to ignore
        limit -- maximum number of records to process
        buffer_size -- number of records to keep in buffer before writing to disk
        quotes -- insert quotes around values (e.g. "user@domain.com")

        Returns:
        the number of records converted
        """
        # get to the root
        event, root = self.context.__next__()

        items = []
        header_line = []
        field_name = ''
        processed_fields = []

        tagged = False
        started = False
        n = 0

        # iterate through the xml
        for event, elem in self.context:
            # if elem is an unignored child node of the record tag, it should be written to buffer
            should_write = elem.tag != tag and started and elem.tag not in ignore
            # and if a header is required and if there isn't one
            should_tag = not tagged and should_write and not noheader

            if event == 'start':
                if elem.tag == tag:
                    # new record: reset the list of fields already written for it
                    processed_fields = []
                if elem.tag == tag and not started:
                    started = True
                elif should_tag:
                    # if elem is nested inside a "parent", field name becomes parent_elem
                    field_name = '_'.join((field_name, elem.tag)) if field_name else elem.tag

            else:
                # write each child tag only once per record
                if should_write and elem.tag not in processed_fields:
                    processed_fields.append(elem.tag)
                    if should_tag:
                        header_line.append(field_name)  # add field name to csv header
                        # remove current tag from the tag name chain
                        field_name = field_name.rpartition('_' + elem.tag)[0]
                    items.append('' if elem.text is None else elem.text.strip().replace('"', r'""'))

                # end of traversing the record tag
                elif elem.tag == tag and len(items) > 0:
                    # csv header (element tag names)
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                    tagged = True

                    # send the csv to buffer
                    if quotes:
                        self.output_buffer.append(r'"' + (r'"' + delimiter + r'"').join(items) + r'"')
                    else:
                        self.output_buffer.append(delimiter.join(items))
                    items = []
                    n += 1

                    # halt if the specified limit has been hit
                    if n == limit:
                        break

                    # flush buffer to disk
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()

                elem.clear()  # discard element and recover memory

        self._write_buffer()  # write rest of the buffer to file
        self.output.close()

        return n
    def _write_buffer(self):
        """Write records from buffer to the output file"""
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []

if __name__ == '__main__':
    converter = xml2csv(sys.argv[1], sys.argv[2])
    converter.convert()
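
# Illustrative usage (not part of the original module). The sample XML below is an
# assumed input; with the default tag="text", delimiter=" " and quotes=True the
# converter should behave roughly as follows:
#
#   input.xml:
#       <records>
#           <text><title>foo</title><body>bar</body></text>
#           <text><title>baz</title><body>qux</body></text>
#       </records>
#
#   python xml2csv.py input.xml output.csv
#
#   output.csv:
#       title body
#       "foo" "bar"
#       "baz" "qux"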