You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

82 lines
2.4 KiB

4 years ago
  1. import re
  2. import six
  3. from w3lib.html import replace_entities as w3lib_replace_entities
  4. def flatten(x):
  5. """flatten(sequence) -> list
  6. Returns a single, flat list which contains all elements retrieved
  7. from the sequence and all recursively contained sub-sequences
  8. (iterables).
  9. Examples:
  10. >>> [1, 2, [3,4], (5,6)]
  11. [1, 2, [3, 4], (5, 6)]
  12. >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
  13. [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
  14. >>> flatten(["foo", "bar"])
  15. ['foo', 'bar']
  16. >>> flatten(["foo", ["baz", 42], "bar"])
  17. ['foo', 'baz', 42, 'bar']
  18. """
  19. return list(iflatten(x))
  20. def iflatten(x):
  21. """iflatten(sequence) -> iterator
  22. Similar to ``.flatten()``, but returns iterator instead"""
  23. for el in x:
  24. if _is_listlike(el):
  25. for el_ in flatten(el):
  26. yield el_
  27. else:
  28. yield el
  29. def _is_listlike(x):
  30. """
  31. >>> _is_listlike("foo")
  32. False
  33. >>> _is_listlike(5)
  34. False
  35. >>> _is_listlike(b"foo")
  36. False
  37. >>> _is_listlike([b"foo"])
  38. True
  39. >>> _is_listlike((b"foo",))
  40. True
  41. >>> _is_listlike({})
  42. True
  43. >>> _is_listlike(set())
  44. True
  45. >>> _is_listlike((x for x in range(3)))
  46. True
  47. >>> _is_listlike(six.moves.xrange(5))
  48. True
  49. """
  50. return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
  51. def extract_regex(regex, text, replace_entities=True):
  52. """Extract a list of unicode strings from the given text/encoding using the following policies:
  53. * if the regex contains a named group called "extract" that will be returned
  54. * if the regex contains multiple numbered groups, all those will be returned (flattened)
  55. * if the regex doesn't contain any group the entire regex matching is returned
  56. """
  57. if isinstance(regex, six.string_types):
  58. regex = re.compile(regex, re.UNICODE)
  59. if 'extract' in regex.groupindex:
  60. # named group
  61. try:
  62. extracted = regex.search(text).group('extract')
  63. except AttributeError:
  64. strings = []
  65. else:
  66. strings = [extracted] if extracted is not None else []
  67. else:
  68. # full regex or numbered groups
  69. strings = regex.findall(text)
  70. strings = flatten(strings)
  71. if not replace_entities:
  72. return strings
  73. return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]