import re

import six
from w3lib.html import replace_entities as w3lib_replace_entities


def flatten(x):
    """flatten(sequence) -> list

    Returns a single, flat list which contains all elements retrieved
    from the sequence and all recursively contained sub-sequences
    (iterables).

    Examples:
    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    return list(iflatten(x))


def iflatten(x):
    """iflatten(sequence) -> iterator

    Similar to ``flatten()``, but returns an iterator instead of a list.
    Nested sub-sequences are walked lazily, depth first, so no
    intermediate lists are built.
    """
    for el in x:
        if _is_listlike(el):
            # Recurse through the generator (not ``flatten``) so nested
            # levels are not materialized into temporary lists.
            for el_ in iflatten(el):
                yield el_
        else:
            yield el


def _is_listlike(x):
    """Return True if ``x`` has ``__iter__`` and is not a text or bytes string.

    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(six.moves.xrange(5))
    True
    """
    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))


def extract_regex(regex, text, replace_entities=True):
    """Extract a list of strings from the given text using the following
    policies:

    * if the regex contains a named group called "extract" that will be
      returned (first match only)
    * if the regex contains multiple numbered groups, all those will be
      returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is
      returned

    :param regex: a pattern string (compiled here with ``re.UNICODE``) or
        an already compiled pattern object, which is used as is.
    :param text: the text to search.
    :param replace_entities: when true (the default), every extracted
        string is passed through ``w3lib.html.replace_entities`` with
        ``keep=['lt', 'amp']``; when false the raw matches are returned.
    :return: a flat list of the extracted strings; empty if nothing matched.
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    if 'extract' in regex.groupindex:
        # Named group: only the first match is considered. Testing the
        # match for None explicitly (rather than catching AttributeError)
        # avoids masking unrelated attribute errors.
        match = regex.search(text)
        if match is None:
            strings = []
        else:
            extracted = match.group('extract')
            # The group is None when it did not take part in the match.
            strings = [extracted] if extracted is not None else []
    else:
        # Full regex or numbered groups: with several groups ``findall``
        # returns tuples, which the flatten step below unpacks.
        strings = regex.findall(text)

    strings = flatten(strings)
    if not replace_entities:
        return strings
    return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]