You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

334 lines
12 KiB

4 years ago
  1. """The ``lxml.isoschematron`` package implements ISO Schematron support on top
  2. of the pure-xslt 'skeleton' implementation.
  3. """
  4. import sys
  5. import os.path
  6. from lxml import etree as _etree # due to validator __init__ signature
  7. # some compat stuff, borrowed from lxml.html
  8. try:
  9. unicode
  10. except NameError:
  11. # Python 3
  12. unicode = str
  13. try:
  14. basestring
  15. except NameError:
  16. # Python 3
  17. basestring = str
  18. __all__ = ['extract_xsd', 'extract_rng', 'iso_dsdl_include',
  19. 'iso_abstract_expand', 'iso_svrl_for_xslt1',
  20. 'svrl_validation_errors', 'schematron_schema_valid',
  21. 'stylesheet_params', 'Schematron']
  22. # some namespaces
  23. #FIXME: Maybe lxml should provide a dedicated place for common namespace
  24. #FIXME: definitions?
  25. XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema"
  26. RELAXNG_NS = "http://relaxng.org/ns/structure/1.0"
  27. SCHEMATRON_NS = "http://purl.oclc.org/dsdl/schematron"
  28. SVRL_NS = "http://purl.oclc.org/dsdl/svrl"
  29. # some helpers
  30. _schematron_root = '{%s}schema' % SCHEMATRON_NS
  31. _xml_schema_root = '{%s}schema' % XML_SCHEMA_NS
  32. _resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
  33. # the iso-schematron skeleton implementation steps aka xsl transformations
  34. extract_xsd = _etree.XSLT(_etree.parse(
  35. os.path.join(_resources_dir, 'xsl', 'XSD2Schtrn.xsl')))
  36. extract_rng = _etree.XSLT(_etree.parse(
  37. os.path.join(_resources_dir, 'xsl', 'RNG2Schtrn.xsl')))
  38. iso_dsdl_include = _etree.XSLT(_etree.parse(
  39. os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
  40. 'iso_dsdl_include.xsl')))
  41. iso_abstract_expand = _etree.XSLT(_etree.parse(
  42. os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
  43. 'iso_abstract_expand.xsl')))
  44. iso_svrl_for_xslt1 = _etree.XSLT(_etree.parse(
  45. os.path.join(_resources_dir,
  46. 'xsl', 'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl')))
  47. # svrl result accessors
  48. svrl_validation_errors = _etree.XPath(
  49. '//svrl:failed-assert', namespaces={'svrl': SVRL_NS})
  50. # RelaxNG validator for schematron schemas
  51. schematron_schema_valid = _etree.RelaxNG(_etree.parse(
  52. os.path.join(_resources_dir, 'rng', 'iso-schematron.rng')))
  53. def stylesheet_params(**kwargs):
  54. """Convert keyword args to a dictionary of stylesheet parameters.
  55. XSL stylesheet parameters must be XPath expressions, i.e.:
  56. * string expressions, like "'5'"
  57. * simple (number) expressions, like "5"
  58. * valid XPath expressions, like "/a/b/text()"
  59. This function converts native Python keyword arguments to stylesheet
  60. parameters following these rules:
  61. If an arg is a string wrap it with XSLT.strparam().
  62. If an arg is an XPath object use its path string.
  63. If arg is None raise TypeError.
  64. Else convert arg to string.
  65. """
  66. result = {}
  67. for key, val in kwargs.items():
  68. if isinstance(val, basestring):
  69. val = _etree.XSLT.strparam(val)
  70. elif val is None:
  71. raise TypeError('None not allowed as a stylesheet parameter')
  72. elif not isinstance(val, _etree.XPath):
  73. val = unicode(val)
  74. result[key] = val
  75. return result
  76. # helper function for use in Schematron __init__
  77. def _stylesheet_param_dict(paramsDict, kwargsDict):
  78. """Return a copy of paramsDict, updated with kwargsDict entries, wrapped as
  79. stylesheet arguments.
  80. kwargsDict entries with a value of None are ignored.
  81. """
  82. # beware of changing mutable default arg
  83. paramsDict = dict(paramsDict)
  84. for k, v in kwargsDict.items():
  85. if v is not None: # None values do not override
  86. paramsDict[k] = v
  87. paramsDict = stylesheet_params(**paramsDict)
  88. return paramsDict
  89. class Schematron(_etree._Validator):
  90. """An ISO Schematron validator.
  91. Pass a root Element or an ElementTree to turn it into a validator.
  92. Alternatively, pass a filename as keyword argument 'file' to parse from
  93. the file system.
  94. Schematron is a less well known, but very powerful schema language.
  95. The main idea is to use the capabilities of XPath to put restrictions on
  96. the structure and the content of XML documents.
  97. The standard behaviour is to fail on ``failed-assert`` findings only
  98. (``ASSERTS_ONLY``). To change this, you can either pass a report filter
  99. function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
  100. or a custom ``XPath`` object), or subclass isoschematron.Schematron for
  101. complete control of the validation process.
  102. Built on the Schematron language 'reference' skeleton pure-xslt
  103. implementation, the validator is created as an XSLT 1.0 stylesheet using
  104. these steps:
  105. 0) (Extract from XML Schema or RelaxNG schema)
  106. 1) Process inclusions
  107. 2) Process abstract patterns
  108. 3) Compile the schematron schema to XSLT
  109. The ``include`` and ``expand`` keyword arguments can be used to switch off
  110. steps 1) and 2).
  111. To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
  112. keyword arguments ``include_params``, ``expand_params`` or
  113. ``compile_params``.
  114. For convenience, the compile-step parameter ``phase`` is also exposed as a
  115. keyword argument ``phase``. This takes precedence if the parameter is also
  116. given in the parameter dictionary.
  117. If ``store_schematron`` is set to True, the (included-and-expanded)
  118. schematron document tree is stored and available through the ``schematron``
  119. property.
  120. If ``store_xslt`` is set to True, the validation XSLT document tree will be
  121. stored and can be retrieved through the ``validator_xslt`` property.
  122. With ``store_report`` set to True (default: False), the resulting validation
  123. report document gets stored and can be accessed as the ``validation_report``
  124. property.
  125. Here is a usage example::
  126. >>> from lxml import etree
  127. >>> from lxml.isoschematron import Schematron
  128. >>> schematron = Schematron(etree.XML('''
  129. ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
  130. ... <pattern id="id_only_attribute">
  131. ... <title>id is the only permitted attribute name</title>
  132. ... <rule context="*">
  133. ... <report test="@*[not(name()='id')]">Attribute
  134. ... <name path="@*[not(name()='id')]"/> is forbidden<name/>
  135. ... </report>
  136. ... </rule>
  137. ... </pattern>
  138. ... </schema>'''),
  139. ... error_finder=Schematron.ASSERTS_AND_REPORTS)
  140. >>> xml = etree.XML('''
  141. ... <AAA name="aaa">
  142. ... <BBB id="bbb"/>
  143. ... <CCC color="ccc"/>
  144. ... </AAA>
  145. ... ''')
  146. >>> schematron.validate(xml)
  147. False
  148. >>> xml = etree.XML('''
  149. ... <AAA id="aaa">
  150. ... <BBB id="bbb"/>
  151. ... <CCC/>
  152. ... </AAA>
  153. ... ''')
  154. >>> schematron.validate(xml)
  155. True
  156. """
  157. # libxml2 error categorization for validation errors
  158. _domain = _etree.ErrorDomains.SCHEMATRONV
  159. _level = _etree.ErrorLevels.ERROR
  160. _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT
  161. # convenience definitions for common behaviours
  162. ASSERTS_ONLY = svrl_validation_errors # Default
  163. ASSERTS_AND_REPORTS = _etree.XPath(
  164. '//svrl:failed-assert | //svrl:successful-report',
  165. namespaces={'svrl': SVRL_NS})
  166. def _extract(self, element):
  167. """Extract embedded schematron schema from non-schematron host schema.
  168. This method will only be called by __init__ if the given schema document
  169. is not a schematron schema by itself.
  170. Must return a schematron schema document tree or None.
  171. """
  172. schematron = None
  173. if element.tag == _xml_schema_root:
  174. schematron = self._extract_xsd(element)
  175. elif element.nsmap[element.prefix] == RELAXNG_NS:
  176. # RelaxNG does not have a single unique root element
  177. schematron = self._extract_rng(element)
  178. return schematron
  179. # customization points
  180. # etree.XSLT objects that provide the extract, include, expand, compile
  181. # steps
  182. _extract_xsd = extract_xsd
  183. _extract_rng = extract_rng
  184. _include = iso_dsdl_include
  185. _expand = iso_abstract_expand
  186. _compile = iso_svrl_for_xslt1
  187. # etree.xpath object that determines input document validity when applied to
  188. # the svrl result report; must return a list of result elements (empty if
  189. # valid)
  190. _validation_errors = ASSERTS_ONLY
  191. def __init__(self, etree=None, file=None, include=True, expand=True,
  192. include_params={}, expand_params={}, compile_params={},
  193. store_schematron=False, store_xslt=False, store_report=False,
  194. phase=None, error_finder=ASSERTS_ONLY):
  195. super(Schematron, self).__init__()
  196. self._store_report = store_report
  197. self._schematron = None
  198. self._validator_xslt = None
  199. self._validation_report = None
  200. if error_finder is not self.ASSERTS_ONLY:
  201. self._validation_errors = error_finder
  202. # parse schema document, may be a schematron schema or an XML Schema or
  203. # a RelaxNG schema with embedded schematron rules
  204. root = None
  205. try:
  206. if etree is not None:
  207. if _etree.iselement(etree):
  208. root = etree
  209. else:
  210. root = etree.getroot()
  211. elif file is not None:
  212. root = _etree.parse(file).getroot()
  213. except Exception:
  214. raise _etree.SchematronParseError(
  215. "No tree or file given: %s" % sys.exc_info()[1])
  216. if root is None:
  217. raise ValueError("Empty tree")
  218. if root.tag == _schematron_root:
  219. schematron = root
  220. else:
  221. schematron = self._extract(root)
  222. if schematron is None:
  223. raise _etree.SchematronParseError(
  224. "Document is not a schematron schema or schematron-extractable")
  225. # perform the iso-schematron skeleton implementation steps to get a
  226. # validating xslt
  227. if include:
  228. schematron = self._include(schematron, **include_params)
  229. if expand:
  230. schematron = self._expand(schematron, **expand_params)
  231. if not schematron_schema_valid(schematron):
  232. raise _etree.SchematronParseError(
  233. "invalid schematron schema: %s" %
  234. schematron_schema_valid.error_log)
  235. if store_schematron:
  236. self._schematron = schematron
  237. # add new compile keyword args here if exposing them
  238. compile_kwargs = {'phase': phase}
  239. compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
  240. validator_xslt = self._compile(schematron, **compile_params)
  241. if store_xslt:
  242. self._validator_xslt = validator_xslt
  243. self._validator = _etree.XSLT(validator_xslt)
  244. def __call__(self, etree):
  245. """Validate doc using Schematron.
  246. Returns true if document is valid, false if not.
  247. """
  248. self._clear_error_log()
  249. result = self._validator(etree)
  250. if self._store_report:
  251. self._validation_report = result
  252. errors = self._validation_errors(result)
  253. if errors:
  254. if _etree.iselement(etree):
  255. fname = etree.getroottree().docinfo.URL or '<file>'
  256. else:
  257. fname = etree.docinfo.URL or '<file>'
  258. for error in errors:
  259. # Does svrl report the line number, anywhere? Don't think so.
  260. self._append_log_message(
  261. domain=self._domain, type=self._error_type,
  262. level=self._level, line=0,
  263. message=_etree.tostring(error, encoding='unicode'),
  264. filename=fname)
  265. return False
  266. return True
  267. @property
  268. def schematron(self):
  269. """ISO-schematron schema document (None if object has been initialized
  270. with store_schematron=False).
  271. """
  272. return self._schematron
  273. @property
  274. def validator_xslt(self):
  275. """ISO-schematron skeleton implementation XSLT validator document (None
  276. if object has been initialized with store_xslt=False).
  277. """
  278. return self._validator_xslt
  279. @property
  280. def validation_report(self):
  281. """ISO-schematron validation result report (None if result-storing has
  282. been turned off).
  283. """
  284. return self._validation_report