|
|
- import re
- import six
- from w3lib.html import replace_entities as w3lib_replace_entities
-
-
def flatten(x):
    """flatten(sequence) -> list

    Return one flat list holding every element of ``x`` together with the
    elements of all recursively nested sub-sequences (iterables). The
    traversal is delegated to ``iflatten``; this wrapper only collects the
    resulting items into a list.

    Examples:
    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    flattened = []
    flattened.extend(iflatten(x))
    return flattened
-
-
def iflatten(x):
    """iflatten(sequence) -> iterator

    Similar to ``flatten()``, but yields the elements one at a time instead
    of returning a list.

    Each element of ``x`` for which ``_is_listlike`` is true is descended
    into recursively; every other element is yielded unchanged. Only the
    elements are tested, ``x`` itself is always iterated.
    """
    for el in x:
        if _is_listlike(el):
            # Recurse through the generator itself rather than ``flatten()``
            # so nested sub-sequences are consumed lazily instead of being
            # copied into a temporary list at every nesting level.
            for el_ in iflatten(el):
                yield el_
        else:
            yield el
-
def _is_listlike(x):
    """Tell whether ``x`` should be treated as a sequence to descend into.

    True for any object exposing ``__iter__`` except text and byte strings,
    which are considered atomic values.

    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(six.moves.xrange(5))
    True
    """
    # Non-iterables can never be list-like.
    if not hasattr(x, "__iter__"):
        return False
    # Strings are iterable but must be kept whole.
    return not isinstance(x, (six.text_type, bytes))
-
-
def extract_regex(regex, text, replace_entities=True):
    """Extract a flat list of strings from ``text`` using ``regex``.

    ``regex`` may be a pattern string (compiled here with ``re.UNICODE``) or
    an already compiled pattern. The result is chosen as follows:

    * if the regex has a named group called "extract", the value of that
      group from the first match is returned (an empty list when there is
      no match or the group did not participate);
    * if the regex has numbered groups, all of them are returned, for every
      match, flattened into one list;
    * if the regex has no group, every full match is returned.

    When ``replace_entities`` is true, each extracted string is passed
    through w3lib's ``replace_entities`` with ``keep=['lt', 'amp']``;
    otherwise the strings are returned as extracted.
    """
    pattern = regex
    if isinstance(pattern, six.string_types):
        pattern = re.compile(pattern, re.UNICODE)

    if 'extract' not in pattern.groupindex:
        # whole-match or numbered groups: collect every occurrence
        found = pattern.findall(text)
    else:
        # named group: only the first match is considered
        try:
            captured = pattern.search(text).group('extract')
        except AttributeError:
            # search() returned None, i.e. the pattern did not match
            captured = None
        found = [] if captured is None else [captured]

    flat = flatten(found)
    if replace_entities:
        return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in flat]
    return flat
|