You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

83 lines
2.4 KiB

import re
import six
from w3lib.html import replace_entities as w3lib_replace_entities
def flatten(x):
"""flatten(sequence) -> list
Returns a single, flat list which contains all elements retrieved
from the sequence and all recursively contained sub-sequences
(iterables).
Examples:
>>> [1, 2, [3,4], (5,6)]
[1, 2, [3, 4], (5, 6)]
>>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
[1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
>>> flatten(["foo", "bar"])
['foo', 'bar']
>>> flatten(["foo", ["baz", 42], "bar"])
['foo', 'baz', 42, 'bar']
"""
return list(iflatten(x))
def iflatten(x):
"""iflatten(sequence) -> iterator
Similar to ``.flatten()``, but returns iterator instead"""
for el in x:
if _is_listlike(el):
for el_ in flatten(el):
yield el_
else:
yield el
def _is_listlike(x):
"""
>>> _is_listlike("foo")
False
>>> _is_listlike(5)
False
>>> _is_listlike(b"foo")
False
>>> _is_listlike([b"foo"])
True
>>> _is_listlike((b"foo",))
True
>>> _is_listlike({})
True
>>> _is_listlike(set())
True
>>> _is_listlike((x for x in range(3)))
True
>>> _is_listlike(six.moves.xrange(5))
True
"""
return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
def extract_regex(regex, text, replace_entities=True):
"""Extract a list of unicode strings from the given text/encoding using the following policies:
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group the entire regex matching is returned
"""
if isinstance(regex, six.string_types):
regex = re.compile(regex, re.UNICODE)
if 'extract' in regex.groupindex:
# named group
try:
extracted = regex.search(text).group('extract')
except AttributeError:
strings = []
else:
strings = [extracted] if extracted is not None else []
else:
# full regex or numbered groups
strings = regex.findall(text)
strings = flatten(strings)
if not replace_entities:
return strings
return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]