You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1885 lines
70 KiB

4 years ago
# Use of this source code is governed by the MIT license that can be
# found in the LICENSE file.
  3. __license__ = "MIT"
  4. try:
  5. from collections.abc import Callable # Python 3.6
  6. except ImportError as e:
  7. from collections import Callable
  8. import re
  9. import shlex
  10. import sys
  11. import warnings
  12. from bs4.dammit import EntitySubstitution
# Encoding used by NavigableString.__new__ when it is given a non-str value.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the running interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)
# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")
  16. def _alias(attr):
  17. """Alias one attribute name to another for backward compatibility"""
  18. @property
  19. def alias(self):
  20. return getattr(self, attr)
  21. @alias.setter
  22. def alias(self):
  23. return setattr(self, attr)
  24. return alias
  25. class NamespacedAttribute(str):
  26. def __new__(cls, prefix, name, namespace=None):
  27. if name is None:
  28. obj = str.__new__(cls, prefix)
  29. elif prefix is None:
  30. # Not really namespaced.
  31. obj = str.__new__(cls, name)
  32. else:
  33. obj = str.__new__(cls, prefix + ":" + name)
  34. obj.prefix = prefix
  35. obj.name = name
  36. obj.namespace = namespace
  37. return obj
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to str itself; it serves as a common
    parent type for the charset-aware attribute values defined in this
    module.
    """
  40. class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
  41. """A generic stand-in for the value of a meta tag's 'charset' attribute.
  42. When Beautiful Soup parses the markup '<meta charset="utf8">', the
  43. value of the 'charset' attribute will be one of these objects.
  44. """
  45. def __new__(cls, original_value):
  46. obj = str.__new__(cls, original_value)
  47. obj.original_value = original_value
  48. return obj
  49. def encode(self, encoding):
  50. return encoding
  51. class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
  52. """A generic stand-in for the value of a meta tag's 'content' attribute.
  53. When Beautiful Soup parses the markup:
  54. <meta http-equiv="content-type" content="text/html; charset=utf8">
  55. The value of the 'content' attribute will be one of these objects.
  56. """
  57. CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
  58. def __new__(cls, original_value):
  59. match = cls.CHARSET_RE.search(original_value)
  60. if match is None:
  61. # No substitution necessary.
  62. return str.__new__(str, original_value)
  63. obj = str.__new__(cls, original_value)
  64. obj.original_value = original_value
  65. return obj
  66. def encode(self, encoding):
  67. def rewrite(match):
  68. return match.group(1) + encoding
  69. return self.CHARSET_RE.sub(rewrite, self.original_value)
  70. class HTMLAwareEntitySubstitution(EntitySubstitution):
  71. """Entity substitution rules that are aware of some HTML quirks.
  72. Specifically, the contents of <script> and <style> tags should not
  73. undergo entity substitution.
  74. Incoming NavigableString objects are checked to see if they're the
  75. direct children of a <script> or <style> tag.
  76. """
  77. cdata_containing_tags = set(["script", "style"])
  78. preformatted_tags = set(["pre"])
  79. preserve_whitespace_tags = set(['pre', 'textarea'])
  80. @classmethod
  81. def _substitute_if_appropriate(cls, ns, f):
  82. if (isinstance(ns, NavigableString)
  83. and ns.parent is not None
  84. and ns.parent.name in cls.cdata_containing_tags):
  85. # Do nothing.
  86. return ns
  87. # Substitute.
  88. return f(ns)
  89. @classmethod
  90. def substitute_html(cls, ns):
  91. return cls._substitute_if_appropriate(
  92. ns, EntitySubstitution.substitute_html)
  93. @classmethod
  94. def substitute_xml(cls, ns):
  95. return cls._substitute_if_appropriate(
  96. ns, EntitySubstitution.substitute_xml)
  97. class Formatter(object):
  98. """Contains information about how to format a parse tree."""
  99. # By default, represent void elements as <tag/> rather than <tag>
  100. void_element_close_prefix = '/'
  101. def substitute_entities(self, *args, **kwargs):
  102. """Transform certain characters into named entities."""
  103. raise NotImplementedError()
class HTMLFormatter(Formatter):
    """The default HTML formatter."""
    def substitute(self, *args, **kwargs):
        # Forwards all arguments to the HTML-aware HTML entity substitution.
        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
class MinimalHTMLFormatter(Formatter):
    """A minimal HTML formatter."""
    def substitute(self, *args, **kwargs):
        # Forwards all arguments to the HTML-aware XML entity substitution.
        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
class HTML5Formatter(HTMLFormatter):
    """An HTML formatter that omits the slash in a void tag."""
    # Overrides the '/' inherited from Formatter.
    void_element_close_prefix = None
class XMLFormatter(Formatter):
    """Substitute only the essential XML entities."""
    def substitute(self, *args, **kwargs):
        # Uses the plain EntitySubstitution, not the HTML-aware subclass.
        return EntitySubstitution.substitute_xml(*args, **kwargs)
class HTMLXMLFormatter(Formatter):
    """Format XML using HTML rules."""
    def substitute(self, *args, **kwargs):
        # Forwards all arguments to the HTML-aware HTML entity substitution.
        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
  123. class PageElement(object):
  124. """Contains the navigational information for some part of the page
  125. (either a tag or a piece of text)"""
    # There are six possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
  128. #
  129. # "html" - All Unicode characters with corresponding HTML entities
  130. # are converted to those entities on output.
  131. # "html5" - The same as "html", but empty void tags are represented as
  132. # <tag> rather than <tag/>
  133. # "minimal" - Bare ampersands and angle brackets are converted to
  134. # XML entities: &amp; &lt; &gt;
  135. # None - The null formatter. Unicode characters are never
  136. # converted to entities. This is not recommended, but it's
  137. # faster than "minimal".
  138. # A callable function - it will be called on every string that needs to undergo entity substitution.
  139. # A Formatter instance - Formatter.substitute(string) will be called on every string that
  140. # needs to undergo entity substitution.
  141. #
  142. # In an HTML document, the default "html", "html5", and "minimal"
  143. # functions will leave the contents of <script> and <style> tags
  144. # alone. For an XML document, all tags will be given the same
  145. # treatment.
    # Formatter name -> formatter used when the element is part of an HTML
    # tree. The None key maps to None (no substitution at all).
    HTML_FORMATTERS = {
        "html" : HTMLFormatter(),
        "html5" : HTML5Formatter(),
        "minimal" : MinimalHTMLFormatter(),
        None : None
        }

    # Formatter name -> formatter used when the element is part of an XML
    # tree. There is no "html5" entry here.
    XML_FORMATTERS = {
        "html" : HTMLXMLFormatter(),
        "minimal" : XMLFormatter(),
        None : None
        }
  157. def format_string(self, s, formatter='minimal'):
  158. """Format the given string using the given formatter."""
  159. if isinstance(formatter, str):
  160. formatter = self._formatter_for_name(formatter)
  161. if formatter is None:
  162. output = s
  163. else:
  164. if callable(formatter):
  165. # Backwards compatibility -- you used to pass in a formatting method.
  166. output = formatter(s)
  167. else:
  168. output = formatter.substitute(s)
  169. return output
  170. @property
  171. def _is_xml(self):
  172. """Is this element part of an XML tree or an HTML tree?
  173. This is used when mapping a formatter name ("minimal") to an
  174. appropriate function (one that performs entity-substitution on
  175. the contents of <script> and <style> tags, or not). It can be
  176. inefficient, but it should be called very rarely.
  177. """
  178. if self.known_xml is not None:
  179. # Most of the time we will have determined this when the
  180. # document is parsed.
  181. return self.known_xml
  182. # Otherwise, it's likely that this element was created by
  183. # direct invocation of the constructor from within the user's
  184. # Python code.
  185. if self.parent is None:
  186. # This is the top-level object. It should have .known_xml set
  187. # from tree creation. If not, take a guess--BS is usually
  188. # used on HTML markup.
  189. return getattr(self, 'is_xml', False)
  190. return self.parent._is_xml
  191. def _formatter_for_name(self, name):
  192. "Look up a formatter function based on its name and the tree."
  193. if self._is_xml:
  194. return self.XML_FORMATTERS.get(name, XMLFormatter())
  195. else:
  196. return self.HTML_FORMATTERS.get(name, HTMLFormatter())
  197. def setup(self, parent=None, previous_element=None, next_element=None,
  198. previous_sibling=None, next_sibling=None):
  199. """Sets up the initial relations between this element and
  200. other elements."""
  201. self.parent = parent
  202. self.previous_element = previous_element
  203. if previous_element is not None:
  204. self.previous_element.next_element = self
  205. self.next_element = next_element
  206. if self.next_element:
  207. self.next_element.previous_element = self
  208. self.next_sibling = next_sibling
  209. if self.next_sibling:
  210. self.next_sibling.previous_sibling = self
  211. if (not previous_sibling
  212. and self.parent is not None and self.parent.contents):
  213. previous_sibling = self.parent.contents[-1]
  214. self.previous_sibling = previous_sibling
  215. if previous_sibling:
  216. self.previous_sibling.next_sibling = self
    # BS3-era camelCase names that forward to the snake_case attributes.
    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3
  219. def replace_with(self, replace_with):
  220. if not self.parent:
  221. raise ValueError(
  222. "Cannot replace one element with another when the"
  223. "element to be replaced is not part of a tree.")
  224. if replace_with is self:
  225. return
  226. if replace_with is self.parent:
  227. raise ValueError("Cannot replace a Tag with its parent.")
  228. old_parent = self.parent
  229. my_index = self.parent.index(self)
  230. self.extract()
  231. old_parent.insert(my_index, replace_with)
  232. return self
  233. replaceWith = replace_with # BS3
  234. def unwrap(self):
  235. my_parent = self.parent
  236. if not self.parent:
  237. raise ValueError(
  238. "Cannot replace an element with its contents when that"
  239. "element is not part of a tree.")
  240. my_index = self.parent.index(self)
  241. self.extract()
  242. for child in reversed(self.contents[:]):
  243. my_parent.insert(my_index, child)
  244. return self
  245. replace_with_children = unwrap
  246. replaceWithChildren = unwrap # BS3
  247. def wrap(self, wrap_inside):
  248. me = self.replace_with(wrap_inside)
  249. wrap_inside.append(me)
  250. return wrap_inside
    def extract(self):
        """Destructively rips this element out of the tree.

        Removes the element from its parent's contents, reconnects the
        elements on either side of it, and clears this element's own
        parent, sibling and outer element links.

        :return: this element.
        """
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut the outer element links of the extracted subtree.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Join the former siblings to each other.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
  276. def _last_descendant(self, is_initialized=True, accept_self=True):
  277. "Finds the last element beneath this object to be parsed."
  278. if is_initialized and self.next_sibling:
  279. last_child = self.next_sibling.previous_element
  280. else:
  281. last_child = self
  282. while isinstance(last_child, Tag) and last_child.contents:
  283. last_child = last_child.contents[-1]
  284. if not accept_self and last_child is self:
  285. last_child = None
  286. return last_child
  287. # BS3: Not part of the API!
  288. _lastRecursiveChild = _last_descendant
    def insert(self, position, new_child):
        """Insert ``new_child`` into this element's contents at ``position``.

        Plain strings are wrapped in NavigableString, and a BeautifulSoup
        object is inserted child by child. An element that already has a
        parent is extracted from it first. The sibling and element links
        of the new child and of its neighbours are rewired.

        :raises ValueError: if ``new_child`` is None or is this element.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        # Imported here rather than at the top of the module.
        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        # Positions past the end mean "append".
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: the element parsed just before it is this
            # element itself.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the ancestors until one of them has a next sibling;
            # that sibling is the element that follows the new child.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
  356. def append(self, tag):
  357. """Appends the given tag to the contents of this tag."""
  358. self.insert(len(self.contents), tag)
  359. def insert_before(self, predecessor):
  360. """Makes the given element the immediate predecessor of this one.
  361. The two elements will have the same parent, and the given element
  362. will be immediately before this one.
  363. """
  364. if self is predecessor:
  365. raise ValueError("Can't insert an element before itself.")
  366. parent = self.parent
  367. if parent is None:
  368. raise ValueError(
  369. "Element has no parent, so 'before' has no meaning.")
  370. # Extract first so that the index won't be screwed up if they
  371. # are siblings.
  372. if isinstance(predecessor, PageElement):
  373. predecessor.extract()
  374. index = parent.index(self)
  375. parent.insert(index, predecessor)
  376. def insert_after(self, successor):
  377. """Makes the given element the immediate successor of this one.
  378. The two elements will have the same parent, and the given element
  379. will be immediately after this one.
  380. """
  381. if self is successor:
  382. raise ValueError("Can't insert an element after itself.")
  383. parent = self.parent
  384. if parent is None:
  385. raise ValueError(
  386. "Element has no parent, so 'after' has no meaning.")
  387. # Extract first so that the index won't be screwed up if they
  388. # are siblings.
  389. if isinstance(successor, PageElement):
  390. successor.extract()
  391. index = parent.index(self)
  392. parent.insert(index+1, successor)
    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document.

        Delegates to find_all_next() through _find_one(); returns None
        when nothing matches.
        """
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3
    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document.

        Searches the next_elements generator through _find_all().
        """
        return self._find_all(name, attrs, text, limit, self.next_elements,
                             **kwargs)
    findAllNext = find_all_next  # BS3
    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document.

        Delegates to find_next_siblings() through _find_one(); returns
        None when nothing matches.
        """
        return self._find_one(self.find_next_siblings, name, attrs, text,
                             **kwargs)
    findNextSibling = find_next_sibling  # BS3
    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document.

        Searches the next_siblings generator through _find_all().
        """
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2
    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document.

        Delegates to find_all_previous() through _find_one(); returns
        None when nothing matches.
        """
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3
    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                          **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document.

        Searches the previous_elements generator through _find_all().
        """
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                           **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2
    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document.

        Delegates to find_previous_siblings() through _find_one(); returns
        None when nothing matches.
        """
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                             **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3
    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document.

        Searches the previous_siblings generator through _find_all().
        """
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2
  447. def find_parent(self, name=None, attrs={}, **kwargs):
  448. """Returns the closest parent of this Tag that matches the given
  449. criteria."""
  450. # NOTE: We can't use _find_one because findParents takes a different
  451. # set of arguments.
  452. r = None
  453. l = self.find_parents(name, attrs, 1, **kwargs)
  454. if l:
  455. r = l[0]
  456. return r
  457. findParent = find_parent # BS3
    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria.

        Searches the parents generator through _find_all(); no text
        criterion is passed.
        """
        return self._find_all(name, attrs, None, limit, self.parents,
                             **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2
    @property
    def next(self):
        """Read-only shorthand for next_element."""
        return self.next_element
    @property
    def previous(self):
        """Read-only shorthand for previous_element."""
        return self.previous_element
  471. #These methods do the real heavy lifting.
  472. def _find_one(self, method, name, attrs, text, **kwargs):
  473. r = None
  474. l = method(name, attrs, text, 1, **kwargs)
  475. if l:
  476. r = l[0]
  477. return r
  478. def _find_all(self, name, attrs, text, limit, generator, **kwargs):
  479. "Iterates over a generator looking for things that match."
  480. if text is None and 'string' in kwargs:
  481. text = kwargs['string']
  482. del kwargs['string']
  483. if isinstance(name, SoupStrainer):
  484. strainer = name
  485. else:
  486. strainer = SoupStrainer(name, attrs, text, **kwargs)
  487. if text is None and not limit and not attrs and not kwargs:
  488. if name is True or name is None:
  489. # Optimization to find all tags.
  490. result = (element for element in generator
  491. if isinstance(element, Tag))
  492. return ResultSet(strainer, result)
  493. elif isinstance(name, str):
  494. # Optimization to find all tags with a given name.
  495. if name.count(':') == 1:
  496. # This is a name with a prefix. If this is a namespace-aware document,
  497. # we need to match the local name against tag.name. If not,
  498. # we need to match the fully-qualified name against tag.name.
  499. prefix, local_name = name.split(':', 1)
  500. else:
  501. prefix = None
  502. local_name = name
  503. result = (element for element in generator
  504. if isinstance(element, Tag)
  505. and (
  506. element.name == name
  507. ) or (
  508. element.name == local_name
  509. and (prefix is None or element.prefix == prefix)
  510. )
  511. )
  512. return ResultSet(strainer, result)
  513. results = ResultSet(strainer)
  514. while True:
  515. try:
  516. i = next(generator)
  517. except StopIteration:
  518. break
  519. if i:
  520. found = strainer.search(i)
  521. if found:
  522. results.append(found)
  523. if limit and len(results) >= limit:
  524. break
  525. return results
  526. #These generators can be used to navigate starting from both
  527. #NavigableStrings and Tags.
  528. @property
  529. def next_elements(self):
  530. i = self.next_element
  531. while i is not None:
  532. yield i
  533. i = i.next_element
  534. @property
  535. def next_siblings(self):
  536. i = self.next_sibling
  537. while i is not None:
  538. yield i
  539. i = i.next_sibling
  540. @property
  541. def previous_elements(self):
  542. i = self.previous_element
  543. while i is not None:
  544. yield i
  545. i = i.previous_element
  546. @property
  547. def previous_siblings(self):
  548. i = self.previous_sibling
  549. while i is not None:
  550. yield i
  551. i = i.previous_sibling
  552. @property
  553. def parents(self):
  554. i = self.parent
  555. while i is not None:
  556. yield i
  557. i = i.parent
    # Methods for supporting CSS selectors.

    # Matches a bare tag name.
    tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')

    # Matches an attribute selector such as tag[attribute~="value"].
    # Named groups:
    #   tag       - optional tag name in front of the bracket
    #   attribute - attribute name (word characters and dashes)
    #   operator  - optional ~, |, ^, $, * or =
    #   value     - everything up to the closing bracket, without quotes
    attribselect_re = re.compile(
        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )
  571. def _attr_value_as_string(self, value, default=None):
  572. """Force an attribute value into a string representation.
  573. A multi-valued attribute will be converted into a
  574. space-separated stirng.
  575. """
  576. value = self.get(value, default)
  577. if isinstance(value, list) or isinstance(value, tuple):
  578. value =" ".join(value)
  579. return value
  580. def _tag_name_matches_and(self, function, tag_name):
  581. if not tag_name:
  582. return function
  583. else:
  584. def _match(tag):
  585. return tag.name == tag_name and function(tag)
  586. return _match
  587. def _attribute_checker(self, operator, attribute, value=''):
  588. """Create a function that performs a CSS selector operation.
  589. Takes an operator, attribute and optional value. Returns a
  590. function that will return True for elements that match that
  591. combination.
  592. """
  593. if operator == '=':
  594. # string representation of `attribute` is equal to `value`
  595. return lambda el: el._attr_value_as_string(attribute) == value
  596. elif operator == '~':
  597. # space-separated list representation of `attribute`
  598. # contains `value`
  599. def _includes_value(element):
  600. attribute_value = element.get(attribute, [])
  601. if not isinstance(attribute_value, list):
  602. attribute_value = attribute_value.split()
  603. return value in attribute_value
  604. return _includes_value
  605. elif operator == '^':
  606. # string representation of `attribute` starts with `value`
  607. return lambda el: el._attr_value_as_string(
  608. attribute, '').startswith(value)
  609. elif operator == '$':
  610. # string representation of `attribute` ends with `value`
  611. return lambda el: el._attr_value_as_string(
  612. attribute, '').endswith(value)
  613. elif operator == '*':
  614. # string representation of `attribute` contains `value`
  615. return lambda el: value in el._attr_value_as_string(attribute, '')
  616. elif operator == '|':
  617. # string representation of `attribute` is either exactly
  618. # `value` or starts with `value` and then a dash.
  619. def _is_or_starts_with_dash(element):
  620. attribute_value = element._attr_value_as_string(attribute, '')
  621. return (attribute_value == value or attribute_value.startswith(
  622. value + '-'))
  623. return _is_or_starts_with_dash
  624. else:
  625. return lambda el: el.has_attr(attribute)
    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        """Method form of the next_elements property."""
        return self.next_elements
    def nextSiblingGenerator(self):
        """Method form of the next_siblings property."""
        return self.next_siblings
    def previousGenerator(self):
        """Method form of the previous_elements property."""
        return self.previous_elements
    def previousSiblingGenerator(self):
        """Method form of the previous_siblings property."""
        return self.previous_siblings
    def parentGenerator(self):
        """Method form of the parents property."""
        return self.parents
  638. class NavigableString(str, PageElement):
  639. PREFIX = ''
  640. SUFFIX = ''
  641. # We can't tell just by looking at a string whether it's contained
  642. # in an XML document or an HTML document.
  643. known_xml = None
  644. def __new__(cls, value):
  645. """Create a new NavigableString.
  646. When unpickling a NavigableString, this method is called with
  647. the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
  648. passed in to the superclass's __new__ or the superclass won't know
  649. how to handle non-ASCII characters.
  650. """
  651. if isinstance(value, str):
  652. u = str.__new__(cls, value)
  653. else:
  654. u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
  655. u.setup()
  656. return u
  657. def __copy__(self):
  658. """A copy of a NavigableString has the same contents and class
  659. as the original, but it is not connected to the parse tree.
  660. """
  661. return type(self)(self)
  662. def __getnewargs__(self):
  663. return (str(self),)
  664. def __getattr__(self, attr):
  665. """text.string gives you text. This is for backwards
  666. compatibility for Navigable*String, but for CData* it lets you
  667. get the string without the CData wrapper."""
  668. if attr == 'string':
  669. return self
  670. else:
  671. raise AttributeError(
  672. "'%s' object has no attribute '%s'" % (
  673. self.__class__.__name__, attr))
  674. def output_ready(self, formatter="minimal"):
  675. output = self.format_string(self, formatter)
  676. return self.PREFIX + output + self.SUFFIX
  677. @property
  678. def name(self):
  679. return None
  680. @name.setter
  681. def name(self, name):
  682. raise AttributeError("A NavigableString cannot be given a name.")
  683. class PreformattedString(NavigableString):
  684. """A NavigableString not subject to the normal formatting rules.
  685. The string will be passed into the formatter (to trigger side effects),
  686. but the return value will be ignored.
  687. """
  688. def output_ready(self, formatter="minimal"):
  689. """CData strings are passed into the formatter.
  690. But the return value is ignored."""
  691. self.format_string(self, formatter)
  692. return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A preformatted string output between '<![CDATA[' and ']]>'."""
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
    """A SGML processing instruction."""
    # Output between '<?' and '>'.
    PREFIX = '<?'
    SUFFIX = '>'
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction."""
    # Output between '<?' and '?>'.
    PREFIX = '<?'
    SUFFIX = '?>'
class Comment(PreformattedString):
    """A preformatted string output between '<!--' and '-->'."""
    PREFIX = '<!--'
    SUFFIX = '-->'
class Declaration(PreformattedString):
    """A preformatted string output between '<?' and '?>'."""
    PREFIX = '<?'
    SUFFIX = '?>'
  710. class Doctype(PreformattedString):
  711. @classmethod
  712. def for_name_and_ids(cls, name, pub_id, system_id):
  713. value = name or ''
  714. if pub_id is not None:
  715. value += ' PUBLIC "%s"' % pub_id
  716. if system_id is not None:
  717. value += ' "%s"' % system_id
  718. elif system_id is not None:
  719. value += ' SYSTEM "%s"' % system_id
  720. return Doctype(value)
  721. PREFIX = '<!DOCTYPE '
  722. SUFFIX = '>\n'
  723. class Tag(PageElement):
  724. """Represents a found HTML tag with its attributes and contents."""
  725. def __init__(self, parser=None, builder=None, name=None, namespace=None,
  726. prefix=None, attrs=None, parent=None, previous=None,
  727. is_xml=None):
  728. "Basic constructor."
  729. if parser is None:
  730. self.parser_class = None
  731. else:
  732. # We don't actually store the parser object: that lets extracted
  733. # chunks be garbage-collected.
  734. self.parser_class = parser.__class__
  735. if name is None:
  736. raise ValueError("No value provided for new tag's name.")
  737. self.name = name
  738. self.namespace = namespace
  739. self.prefix = prefix
  740. if builder is not None:
  741. preserve_whitespace_tags = builder.preserve_whitespace_tags
  742. else:
  743. if is_xml:
  744. preserve_whitespace_tags = []
  745. else:
  746. preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
  747. self.preserve_whitespace_tags = preserve_whitespace_tags
  748. if attrs is None:
  749. attrs = {}
  750. elif attrs:
  751. if builder is not None and builder.cdata_list_attributes:
  752. attrs = builder._replace_cdata_list_attribute_values(
  753. self.name, attrs)
  754. else:
  755. attrs = dict(attrs)
  756. else:
  757. attrs = dict(attrs)
  758. # If possible, determine ahead of time whether this tag is an
  759. # XML tag.
  760. if builder:
  761. self.known_xml = builder.is_xml
  762. else:
  763. self.known_xml = is_xml
  764. self.attrs = attrs
  765. self.contents = []
  766. self.setup(parent, previous)
  767. self.hidden = False
  768. # Set up any substitutions, such as the charset in a META tag.
  769. if builder is not None:
  770. builder.set_up_substitutions(self)
  771. self.can_be_empty_element = builder.can_be_empty_element(name)
  772. else:
  773. self.can_be_empty_element = False
  774. parserClass = _alias("parser_class") # BS3
  775. def __copy__(self):
  776. """A copy of a Tag is a new Tag, unconnected to the parse tree.
  777. Its contents are a copy of the old Tag's contents.
  778. """
  779. clone = type(self)(None, self.builder, self.name, self.namespace,
  780. self.prefix, self.attrs, is_xml=self._is_xml)
  781. for attr in ('can_be_empty_element', 'hidden'):
  782. setattr(clone, attr, getattr(self, attr))
  783. for child in self.contents:
  784. clone.append(child.__copy__())
  785. return clone
  786. @property
  787. def is_empty_element(self):
  788. """Is this tag an empty-element tag? (aka a self-closing tag)
  789. A tag that has contents is never an empty-element tag.
  790. A tag that has no contents may or may not be an empty-element
  791. tag. It depends on the builder used to create the tag. If the
  792. builder has a designated list of empty-element tags, then only
  793. a tag whose name shows up in that list is considered an
  794. empty-element tag.
  795. If the builder has no designated list of empty-element tags,
  796. then any tag with no contents is an empty-element tag.
  797. """
  798. return len(self.contents) == 0 and self.can_be_empty_element
  799. isSelfClosing = is_empty_element # BS3
  800. @property
  801. def string(self):
  802. """Convenience property to get the single string within this tag.
  803. :Return: If this tag has a single string child, return value
  804. is that string. If this tag has no children, or more than one
  805. child, return value is None. If this tag has one child tag,
  806. return value is the 'string' attribute of the child tag,
  807. recursively.
  808. """
  809. if len(self.contents) != 1:
  810. return None
  811. child = self.contents[0]
  812. if isinstance(child, NavigableString):
  813. return child
  814. return child.string
  815. @string.setter
  816. def string(self, string):
  817. self.clear()
  818. self.append(string.__class__(string))
  819. def _all_strings(self, strip=False, types=(NavigableString, CData)):
  820. """Yield all strings of certain classes, possibly stripping them.
  821. By default, yields only NavigableString and CData objects. So
  822. no comments, processing instructions, etc.
  823. """
  824. for descendant in self.descendants:
  825. if (
  826. (types is None and not isinstance(descendant, NavigableString))
  827. or
  828. (types is not None and type(descendant) not in types)):
  829. continue
  830. if strip:
  831. descendant = descendant.strip()
  832. if len(descendant) == 0:
  833. continue
  834. yield descendant
  835. strings = property(_all_strings)
  836. @property
  837. def stripped_strings(self):
  838. for string in self._all_strings(True):
  839. yield string
  840. def get_text(self, separator="", strip=False,
  841. types=(NavigableString, CData)):
  842. """
  843. Get all child strings, concatenated using the given separator.
  844. """
  845. return separator.join([s for s in self._all_strings(
  846. strip, types=types)])
  847. getText = get_text
  848. text = property(get_text)
  849. def decompose(self):
  850. """Recursively destroys the contents of this tree."""
  851. self.extract()
  852. i = self
  853. while i is not None:
  854. next = i.next_element
  855. i.__dict__.clear()
  856. i.contents = []
  857. i = next
  858. def clear(self, decompose=False):
  859. """
  860. Extract all children. If decompose is True, decompose instead.
  861. """
  862. if decompose:
  863. for element in self.contents[:]:
  864. if isinstance(element, Tag):
  865. element.decompose()
  866. else:
  867. element.extract()
  868. else:
  869. for element in self.contents[:]:
  870. element.extract()
  871. def index(self, element):
  872. """
  873. Find the index of a child by identity, not value. Avoids issues with
  874. tag.contents.index(element) getting the index of equal elements.
  875. """
  876. for i, child in enumerate(self.contents):
  877. if child is element:
  878. return i
  879. raise ValueError("Tag.index: element not in tag")
  880. def get(self, key, default=None):
  881. """Returns the value of the 'key' attribute for the tag, or
  882. the value given for 'default' if it doesn't have that
  883. attribute."""
  884. return self.attrs.get(key, default)
  885. def get_attribute_list(self, key, default=None):
  886. """The same as get(), but always returns a list."""
  887. value = self.get(key, default)
  888. if not isinstance(value, list):
  889. value = [value]
  890. return value
  891. def has_attr(self, key):
  892. return key in self.attrs
  893. def __hash__(self):
  894. return str(self).__hash__()
  895. def __getitem__(self, key):
  896. """tag[key] returns the value of the 'key' attribute for the tag,
  897. and throws an exception if it's not there."""
  898. return self.attrs[key]
  899. def __iter__(self):
  900. "Iterating over a tag iterates over its contents."
  901. return iter(self.contents)
  902. def __len__(self):
  903. "The length of a tag is the length of its list of contents."
  904. return len(self.contents)
  905. def __contains__(self, x):
  906. return x in self.contents
  907. def __bool__(self):
  908. "A tag is non-None even if it has no contents."
  909. return True
  910. def __setitem__(self, key, value):
  911. """Setting tag[key] sets the value of the 'key' attribute for the
  912. tag."""
  913. self.attrs[key] = value
  914. def __delitem__(self, key):
  915. "Deleting tag[key] deletes all 'key' attributes for the tag."
  916. self.attrs.pop(key, None)
  917. def __call__(self, *args, **kwargs):
  918. """Calling a tag like a function is the same as calling its
  919. find_all() method. Eg. tag('a') returns a list of all the A tags
  920. found within this tag."""
  921. return self.find_all(*args, **kwargs)
  922. def __getattr__(self, tag):
  923. #print "Getattr %s.%s" % (self.__class__, tag)
  924. if len(tag) > 3 and tag.endswith('Tag'):
  925. # BS3: soup.aTag -> "soup.find("a")
  926. tag_name = tag[:-3]
  927. warnings.warn(
  928. '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
  929. name=tag_name
  930. )
  931. )
  932. return self.find(tag_name)
  933. # We special case contents to avoid recursion.
  934. elif not tag.startswith("__") and not tag == "contents":
  935. return self.find(tag)
  936. raise AttributeError(
  937. "'%s' object has no attribute '%s'" % (self.__class__, tag))
  938. def __eq__(self, other):
  939. """Returns true iff this tag has the same name, the same attributes,
  940. and the same contents (recursively) as the given tag."""
  941. if self is other:
  942. return True
  943. if (not hasattr(other, 'name') or
  944. not hasattr(other, 'attrs') or
  945. not hasattr(other, 'contents') or
  946. self.name != other.name or
  947. self.attrs != other.attrs or
  948. len(self) != len(other)):
  949. return False
  950. for i, my_child in enumerate(self.contents):
  951. if my_child != other.contents[i]:
  952. return False
  953. return True
  954. def __ne__(self, other):
  955. """Returns true iff this tag is not identical to the other tag,
  956. as defined in __eq__."""
  957. return not self == other
  958. def __repr__(self, encoding="unicode-escape"):
  959. """Renders this tag as a string."""
  960. if PY3K:
  961. # "The return value must be a string object", i.e. Unicode
  962. return self.decode()
  963. else:
  964. # "The return value must be a string object", i.e. a bytestring.
  965. # By convention, the return value of __repr__ should also be
  966. # an ASCII string.
  967. return self.encode(encoding)
  968. def __unicode__(self):
  969. return self.decode()
  970. def __str__(self):
  971. if PY3K:
  972. return self.decode()
  973. else:
  974. return self.encode()
  975. if PY3K:
  976. __str__ = __repr__ = __unicode__
  977. def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
  978. indent_level=None, formatter="minimal",
  979. errors="xmlcharrefreplace"):
  980. # Turn the data structure into Unicode, then encode the
  981. # Unicode.
  982. u = self.decode(indent_level, encoding, formatter)
  983. return u.encode(encoding, errors)
  984. def _should_pretty_print(self, indent_level):
  985. """Should this tag be pretty-printed?"""
  986. return (
  987. indent_level is not None
  988. and self.name not in self.preserve_whitespace_tags
  989. )
  990. def decode(self, indent_level=None,
  991. eventual_encoding=DEFAULT_OUTPUT_ENCODING,
  992. formatter="minimal"):
  993. """Returns a Unicode representation of this tag and its contents.
  994. :param eventual_encoding: The tag is destined to be
  995. encoded into this encoding. This method is _not_
  996. responsible for performing that encoding. This information
  997. is passed in so that it can be substituted in if the
  998. document contains a <META> tag that mentions the document's
  999. encoding.
  1000. """
  1001. # First off, turn a string formatter into a Formatter object. This
  1002. # will stop the lookup from happening over and over again.
  1003. if not isinstance(formatter, Formatter) and not callable(formatter):
  1004. formatter = self._formatter_for_name(formatter)
  1005. attrs = []
  1006. if self.attrs:
  1007. for key, val in sorted(self.attrs.items()):
  1008. if val is None:
  1009. decoded = key
  1010. else:
  1011. if isinstance(val, list) or isinstance(val, tuple):
  1012. val = ' '.join(val)
  1013. elif not isinstance(val, str):
  1014. val = str(val)
  1015. elif (
  1016. isinstance(val, AttributeValueWithCharsetSubstitution)
  1017. and eventual_encoding is not None):
  1018. val = val.encode(eventual_encoding)
  1019. text = self.format_string(val, formatter)
  1020. decoded = (
  1021. str(key) + '='
  1022. + EntitySubstitution.quoted_attribute_value(text))
  1023. attrs.append(decoded)
  1024. close = ''
  1025. closeTag = ''
  1026. prefix = ''
  1027. if self.prefix:
  1028. prefix = self.prefix + ":"
  1029. if self.is_empty_element:
  1030. close = ''
  1031. if isinstance(formatter, Formatter):
  1032. close = formatter.void_element_close_prefix or close
  1033. else:
  1034. closeTag = '</%s%s>' % (prefix, self.name)
  1035. pretty_print = self._should_pretty_print(indent_level)
  1036. space = ''
  1037. indent_space = ''
  1038. if indent_level is not None:
  1039. indent_space = (' ' * (indent_level - 1))
  1040. if pretty_print:
  1041. space = indent_space
  1042. indent_contents = indent_level + 1
  1043. else:
  1044. indent_contents = None
  1045. contents = self.decode_contents(
  1046. indent_contents, eventual_encoding, formatter)
  1047. if self.hidden:
  1048. # This is the 'document root' object.
  1049. s = contents
  1050. else:
  1051. s = []
  1052. attribute_string = ''
  1053. if attrs:
  1054. attribute_string = ' ' + ' '.join(attrs)
  1055. if indent_level is not None:
  1056. # Even if this particular tag is not pretty-printed,
  1057. # we should indent up to the start of the tag.
  1058. s.append(indent_space)
  1059. s.append('<%s%s%s%s>' % (
  1060. prefix, self.name, attribute_string, close))
  1061. if pretty_print:
  1062. s.append("\n")
  1063. s.append(contents)
  1064. if pretty_print and contents and contents[-1] != "\n":
  1065. s.append("\n")
  1066. if pretty_print and closeTag:
  1067. s.append(space)
  1068. s.append(closeTag)
  1069. if indent_level is not None and closeTag and self.next_sibling:
  1070. # Even if this particular tag is not pretty-printed,
  1071. # we're now done with the tag, and we should add a
  1072. # newline if appropriate.
  1073. s.append("\n")
  1074. s = ''.join(s)
  1075. return s
  1076. def prettify(self, encoding=None, formatter="minimal"):
  1077. if encoding is None:
  1078. return self.decode(True, formatter=formatter)
  1079. else:
  1080. return self.encode(encoding, True, formatter=formatter)
  1081. def decode_contents(self, indent_level=None,
  1082. eventual_encoding=DEFAULT_OUTPUT_ENCODING,
  1083. formatter="minimal"):
  1084. """Renders the contents of this tag as a Unicode string.
  1085. :param indent_level: Each line of the rendering will be
  1086. indented this many spaces.
  1087. :param eventual_encoding: The tag is destined to be
  1088. encoded into this encoding. This method is _not_
  1089. responsible for performing that encoding. This information
  1090. is passed in so that it can be substituted in if the
  1091. document contains a <META> tag that mentions the document's
  1092. encoding.
  1093. :param formatter: The output formatter responsible for converting
  1094. entities to Unicode characters.
  1095. """
  1096. # First off, turn a string formatter into a Formatter object. This
  1097. # will stop the lookup from happening over and over again.
  1098. if not isinstance(formatter, Formatter) and not callable(formatter):
  1099. formatter = self._formatter_for_name(formatter)
  1100. pretty_print = (indent_level is not None)
  1101. s = []
  1102. for c in self:
  1103. text = None
  1104. if isinstance(c, NavigableString):
  1105. text = c.output_ready(formatter)
  1106. elif isinstance(c, Tag):
  1107. s.append(c.decode(indent_level, eventual_encoding,
  1108. formatter))
  1109. if text and indent_level and not self.name == 'pre':
  1110. text = text.strip()
  1111. if text:
  1112. if pretty_print and not self.name == 'pre':
  1113. s.append(" " * (indent_level - 1))
  1114. s.append(text)
  1115. if pretty_print and not self.name == 'pre':
  1116. s.append("\n")
  1117. return ''.join(s)
  1118. def encode_contents(
  1119. self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
  1120. formatter="minimal"):
  1121. """Renders the contents of this tag as a bytestring.
  1122. :param indent_level: Each line of the rendering will be
  1123. indented this many spaces.
  1124. :param eventual_encoding: The bytestring will be in this encoding.
  1125. :param formatter: The output formatter responsible for converting
  1126. entities to Unicode characters.
  1127. """
  1128. contents = self.decode_contents(indent_level, encoding, formatter)
  1129. return contents.encode(encoding)
  1130. # Old method for BS3 compatibility
  1131. def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
  1132. prettyPrint=False, indentLevel=0):
  1133. if not prettyPrint:
  1134. indentLevel = None
  1135. return self.encode_contents(
  1136. indent_level=indentLevel, encoding=encoding)
  1137. #Soup methods
  1138. def find(self, name=None, attrs={}, recursive=True, text=None,
  1139. **kwargs):
  1140. """Return only the first child of this Tag matching the given
  1141. criteria."""
  1142. r = None
  1143. l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
  1144. if l:
  1145. r = l[0]
  1146. return r
  1147. findChild = find
  1148. def find_all(self, name=None, attrs={}, recursive=True, text=None,
  1149. limit=None, **kwargs):
  1150. """Extracts a list of Tag objects that match the given
  1151. criteria. You can specify the name of the Tag and any
  1152. attributes you want the Tag to have.
  1153. The value of a key-value pair in the 'attrs' map can be a
  1154. string, a list of strings, a regular expression object, or a
  1155. callable that takes a string and returns whether or not the
  1156. string matches for some custom definition of 'matches'. The
  1157. same is true of the tag name."""
  1158. generator = self.descendants
  1159. if not recursive:
  1160. generator = self.children
  1161. return self._find_all(name, attrs, text, limit, generator, **kwargs)
  1162. findAll = find_all # BS3
  1163. findChildren = find_all # BS2
  1164. #Generator methods
  1165. @property
  1166. def children(self):
  1167. # return iter() to make the purpose of the method clear
  1168. return iter(self.contents) # XXX This seems to be untested.
  1169. @property
  1170. def descendants(self):
  1171. if not len(self.contents):
  1172. return
  1173. stopNode = self._last_descendant().next_element
  1174. current = self.contents[0]
  1175. while current is not stopNode:
  1176. yield current
  1177. current = current.next_element
  1178. # CSS selector code
  1179. _selector_combinators = ['>', '+', '~']
  1180. _select_debug = False
  1181. quoted_colon = re.compile('"[^"]*:[^"]*"')
  1182. def select_one(self, selector):
  1183. """Perform a CSS selection operation on the current element."""
  1184. value = self.select(selector, limit=1)
  1185. if value:
  1186. return value[0]
  1187. return None
  1188. def select(self, selector, _candidate_generator=None, limit=None):
  1189. """Perform a CSS selection operation on the current element."""
  1190. # Handle grouping selectors if ',' exists, ie: p,a
  1191. if ',' in selector:
  1192. context = []
  1193. selectors = [x.strip() for x in selector.split(",")]
  1194. # If a selector is mentioned multiple times we don't want
  1195. # to use it more than once.
  1196. used_selectors = set()
  1197. # We also don't want to select the same element more than once,
  1198. # if it's matched by multiple selectors.
  1199. selected_object_ids = set()
  1200. for partial_selector in selectors:
  1201. if partial_selector == '':
  1202. raise ValueError('Invalid group selection syntax: %s' % selector)
  1203. if partial_selector in used_selectors:
  1204. continue
  1205. used_selectors.add(partial_selector)
  1206. candidates = self.select(partial_selector, limit=limit)
  1207. for candidate in candidates:
  1208. # This lets us distinguish between distinct tags that
  1209. # represent the same markup.
  1210. object_id = id(candidate)
  1211. if object_id not in selected_object_ids:
  1212. context.append(candidate)
  1213. selected_object_ids.add(object_id)
  1214. if limit and len(context) >= limit:
  1215. break
  1216. return context
  1217. tokens = shlex.split(selector)
  1218. current_context = [self]
  1219. if tokens[-1] in self._selector_combinators:
  1220. raise ValueError(
  1221. 'Final combinator "%s" is missing an argument.' % tokens[-1])
  1222. if self._select_debug:
  1223. print('Running CSS selector "%s"' % selector)
  1224. for index, token in enumerate(tokens):
  1225. new_context = []
  1226. new_context_ids = set([])
  1227. if tokens[index-1] in self._selector_combinators:
  1228. # This token was consumed by the previous combinator. Skip it.
  1229. if self._select_debug:
  1230. print(' Token was consumed by the previous combinator.')
  1231. continue
  1232. if self._select_debug:
  1233. print(' Considering token "%s"' % token)
  1234. recursive_candidate_generator = None
  1235. tag_name = None
  1236. # Each operation corresponds to a checker function, a rule
  1237. # for determining whether a candidate matches the
  1238. # selector. Candidates are generated by the active
  1239. # iterator.
  1240. checker = None
  1241. m = self.attribselect_re.match(token)
  1242. if m is not None:
  1243. # Attribute selector
  1244. tag_name, attribute, operator, value = m.groups()
  1245. checker = self._attribute_checker(operator, attribute, value)
  1246. elif '#' in token:
  1247. # ID selector
  1248. tag_name, tag_id = token.split('#', 1)
  1249. def id_matches(tag):
  1250. return tag.get('id', None) == tag_id
  1251. checker = id_matches
  1252. elif '.' in token:
  1253. # Class selector
  1254. tag_name, klass = token.split('.', 1)
  1255. classes = set(klass.split('.'))
  1256. def classes_match(candidate):
  1257. return classes.issubset(candidate.get('class', []))
  1258. checker = classes_match
  1259. elif ':' in token and not self.quoted_colon.search(token):
  1260. # Pseudo-class
  1261. tag_name, pseudo = token.split(':', 1)
  1262. if tag_name == '':
  1263. raise ValueError(
  1264. "A pseudo-class must be prefixed with a tag name.")
  1265. pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
  1266. found = []
  1267. if pseudo_attributes is None:
  1268. pseudo_type = pseudo
  1269. pseudo_value = None
  1270. else:
  1271. pseudo_type, pseudo_value = pseudo_attributes.groups()
  1272. if pseudo_type == 'nth-of-type':
  1273. try:
  1274. pseudo_value = int(pseudo_value)
  1275. except:
  1276. raise NotImplementedError(
  1277. 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
  1278. if pseudo_value < 1:
  1279. raise ValueError(
  1280. 'nth-of-type pseudo-class value must be at least 1.')
  1281. class Counter(object):
  1282. def __init__(self, destination):
  1283. self.count = 0
  1284. self.destination = destination
  1285. def nth_child_of_type(self, tag):
  1286. self.count += 1
  1287. if self.count == self.destination:
  1288. return True
  1289. else:
  1290. return False
  1291. checker = Counter(pseudo_value).nth_child_of_type
  1292. else:
  1293. raise NotImplementedError(
  1294. 'Only the following pseudo-classes are implemented: nth-of-type.')
  1295. elif token == '*':
  1296. # Star selector -- matches everything
  1297. pass
  1298. elif token == '>':
  1299. # Run the next token as a CSS selector against the
  1300. # direct children of each tag in the current context.
  1301. recursive_candidate_generator = lambda tag: tag.children
  1302. elif token == '~':
  1303. # Run the next token as a CSS selector against the
  1304. # siblings of each tag in the current context.
  1305. recursive_candidate_generator = lambda tag: tag.next_siblings
  1306. elif token == '+':
  1307. # For each tag in the current context, run the next
  1308. # token as a CSS selector against the tag's next
  1309. # sibling that's a tag.
  1310. def next_tag_sibling(tag):
  1311. yield tag.find_next_sibling(True)
  1312. recursive_candidate_generator = next_tag_sibling
  1313. elif self.tag_name_re.match(token):
  1314. # Just a tag name.
  1315. tag_name = token
  1316. else:
  1317. raise ValueError(
  1318. 'Unsupported or invalid CSS selector: "%s"' % token)
  1319. if recursive_candidate_generator:
  1320. # This happens when the selector looks like "> foo".
  1321. #
  1322. # The generator calls select() recursively on every
  1323. # member of the current context, passing in a different
  1324. # candidate generator and a different selector.
  1325. #
  1326. # In the case of "> foo", the candidate generator is
  1327. # one that yields a tag's direct children (">"), and
  1328. # the selector is "foo".
  1329. next_token = tokens[index+1]
  1330. def recursive_select(tag):
  1331. if self._select_debug:
  1332. print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
  1333. print('-' * 40)
  1334. for i in tag.select(next_token, recursive_candidate_generator):
  1335. if self._select_debug:
  1336. print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
  1337. yield i
  1338. if self._select_debug:
  1339. print('-' * 40)
  1340. _use_candidate_generator = recursive_select
  1341. elif _candidate_generator is None:
  1342. # By default, a tag's candidates are all of its
  1343. # children. If tag_name is defined, only yield tags
  1344. # with that name.
  1345. if self._select_debug:
  1346. if tag_name:
  1347. check = "[any]"
  1348. else:
  1349. check = tag_name
  1350. print(' Default candidate generator, tag name="%s"' % check)
  1351. if self._select_debug:
  1352. # This is redundant with later code, but it stops
  1353. # a bunch of bogus tags from cluttering up the
  1354. # debug log.
  1355. def default_candidate_generator(tag):
  1356. for child in tag.descendants:
  1357. if not isinstance(child, Tag):
  1358. continue
  1359. if tag_name and not child.name == tag_name:
  1360. continue
  1361. yield child
  1362. _use_candidate_generator = default_candidate_generator
  1363. else:
  1364. _use_candidate_generator = lambda tag: tag.descendants
  1365. else:
  1366. _use_candidate_generator = _candidate_generator
  1367. count = 0
  1368. for tag in current_context:
  1369. if self._select_debug:
  1370. print(" Running candidate generator on %s %s" % (
  1371. tag.name, repr(tag.attrs)))
  1372. for candidate in _use_candidate_generator(tag):
  1373. if not isinstance(candidate, Tag):
  1374. continue
  1375. if tag_name and candidate.name != tag_name:
  1376. continue
  1377. if checker is not None:
  1378. try:
  1379. result = checker(candidate)
  1380. except StopIteration:
  1381. # The checker has decided we should no longer
  1382. # run the generator.
  1383. break
  1384. if checker is None or result:
  1385. if self._select_debug:
  1386. print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
  1387. if id(candidate) not in new_context_ids:
  1388. # If a tag matches a selector more than once,
  1389. # don't include it in the context more than once.
  1390. new_context.append(candidate)
  1391. new_context_ids.add(id(candidate))
  1392. elif self._select_debug:
  1393. print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
  1394. current_context = new_context
  1395. if limit and len(current_context) >= limit:
  1396. current_context = current_context[:limit]
  1397. if self._select_debug:
  1398. print("Final verdict:")
  1399. for i in current_context:
  1400. print(" %s %s" % (i.name, i.attrs))
  1401. return current_context
  1402. # Old names for backwards compatibility
  1403. def childGenerator(self):
  1404. return self.children
  1405. def recursiveChildGenerator(self):
  1406. return self.descendants
  1407. def has_key(self, key):
  1408. """This was kind of misleading because has_key() (attributes)
  1409. was different from __in__ (contents). has_key() is gone in
  1410. Python 3, anyway."""
  1411. warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
  1412. key))
  1413. return self.has_attr(key)
  1414. # Next, a couple classes to represent queries and their results.
  1415. class SoupStrainer(object):
  1416. """Encapsulates a number of ways of matching a markup element (tag or
  1417. text)."""
  1418. def __init__(self, name=None, attrs={}, text=None, **kwargs):
  1419. self.name = self._normalize_search_value(name)
  1420. if not isinstance(attrs, dict):
  1421. # Treat a non-dict value for attrs as a search for the 'class'
  1422. # attribute.
  1423. kwargs['class'] = attrs
  1424. attrs = None
  1425. if 'class_' in kwargs:
  1426. # Treat class_="foo" as a search for the 'class'
  1427. # attribute, overriding any non-dict value for attrs.
  1428. kwargs['class'] = kwargs['class_']
  1429. del kwargs['class_']
  1430. if kwargs:
  1431. if attrs:
  1432. attrs = attrs.copy()
  1433. attrs.update(kwargs)
  1434. else:
  1435. attrs = kwargs
  1436. normalized_attrs = {}
  1437. for key, value in list(attrs.items()):
  1438. normalized_attrs[key] = self._normalize_search_value(value)
  1439. self.attrs = normalized_attrs
  1440. self.text = self._normalize_search_value(text)
  1441. def _normalize_search_value(self, value):
  1442. # Leave it alone if it's a Unicode string, a callable, a
  1443. # regular expression, a boolean, or None.
  1444. if (isinstance(value, str) or callable(value) or hasattr(value, 'match')
  1445. or isinstance(value, bool) or value is None):
  1446. return value
  1447. # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
  1448. if isinstance(value, bytes):
  1449. return value.decode("utf8")
  1450. # If it's listlike, convert it into a list of strings.
  1451. if hasattr(value, '__iter__'):
  1452. new_value = []
  1453. for v in value:
  1454. if (hasattr(v, '__iter__') and not isinstance(v, bytes)
  1455. and not isinstance(v, str)):
  1456. # This is almost certainly the user's mistake. In the
  1457. # interests of avoiding infinite loops, we'll let
  1458. # it through as-is rather than doing a recursive call.
  1459. new_value.append(v)
  1460. else:
  1461. new_value.append(self._normalize_search_value(v))
  1462. return new_value
  1463. # Otherwise, convert it into a Unicode string.
  1464. # The unicode(str()) thing is so this will do the same thing on Python 2
  1465. # and Python 3.
  1466. return str(str(value))
  1467. def __str__(self):
  1468. if self.text:
  1469. return self.text
  1470. else:
  1471. return "%s|%s" % (self.name, self.attrs)
    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check a tag against this strainer's name, attribute and text criteria.

        :param markup_name: Either a Tag, or the name of a tag.
        :param markup_attrs: The tag's attributes, as a mapping (anything
            with a `get` method) or an iterable of (key, value) pairs.
            Ignored when `markup_name` is a Tag: the Tag itself is then
            used for attribute lookups.
        :return: `markup_name` (the Tag or the name that was passed in)
            if everything matches, otherwise None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            # Attribute lookups below go through the Tag's own get().
            markup_attrs = markup
        # A callable name criterion is called with (name, attrs), but only
        # when we were given raw tag data rather than a Tag object.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        # Proceed if there is no name criterion, if the criterion is a
        # function to be called with raw tag data, or if the name matches.
        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                # Built lazily, on the first attribute criterion.
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Turn an iterable of pairs into a dict.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # With a text criterion, the match's .string must match it too.
        # NOTE(review): when `found` is a plain tag name this accesses
        # `.string` on that name object -- confirm callers never combine
        # a text criterion with raw tag data.
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
  1510. searchTag = search_tag
  1511. def search(self, markup):
  1512. # print 'looking for %s in %s' % (self, markup)
  1513. found = None
  1514. # If given a list of items, scan it for a text element that
  1515. # matches.
  1516. if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
  1517. for element in markup:
  1518. if isinstance(element, NavigableString) \
  1519. and self.search(element):
  1520. found = element
  1521. break
  1522. # If it's a Tag, make sure its name or attributes match.
  1523. # Don't bother with Tags if we're searching for text.
  1524. elif isinstance(markup, Tag):
  1525. if not self.text or self.name or self.attrs:
  1526. found = self.search_tag(markup)
  1527. # If it's text, make sure the text matches.
  1528. elif isinstance(markup, NavigableString) or \
  1529. isinstance(markup, str):
  1530. if not self.name and not self.attrs and self._matches(markup, self.text):
  1531. found = markup
  1532. else:
  1533. raise Exception(
  1534. "I don't know how to match against a %s" % markup.__class__)
  1535. return found
    def _matches(self, markup, match_against, already_tried=None):
        """Check one piece of markup against one search criterion.

        :param markup: A Tag, a string, None, or a list/tuple of values
            (the latter for a multi-valued attribute such as 'class').
        :param match_against: The criterion: True, a callable, a string,
            an object with a `search` method (e.g. a compiled regular
            expression), or an iterable of such criteria.
        :param already_tried: Set of criteria already tested while walking
            an iterable criterion; used to avoid infinite recursion.
        :return: A truthy value if the markup matches. This is usually a
            bool, but for a callable or regular-expression criterion it is
            whatever that call returned.
        """
        # print u"Matching %s against %s" % (markup, match_against)
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                else:
                    already_tried.add(key)
                    if self._matches(original_markup, item, already_tried):
                        return True
            else:
                # for/else: the loop finished without any item matching.
                return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
  class ResultSet(list):
      """A plain list of results that also remembers where it came from.

      The object that produced the results (a SoupStrainer) is kept in
      the `source` attribute. Apart from that, and a friendlier error
      for unknown attributes, this behaves exactly like a list.
      """

      def __init__(self, source, result=()):
          """Fill the list with `result` and record `source`.

          :param source: The object that created this result set; stored
              unchanged as `self.source`.
          :param result: Iterable holding the initial items. Defaults to
              an empty tuple, which yields an empty list.
          """
          super(ResultSet, self).__init__(result)
          self.source = source

      def __getattr__(self, key):
          """Reject any attribute that normal lookup could not find.

          Python only calls this hook after regular attribute lookup has
          failed, so list methods and `source` are unaffected.

          :param key: Name of the missing attribute.
          :raises AttributeError: Always, with a hint that a list of
              results is probably being used as if it were a single one.
          """
          template = "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?"
          raise AttributeError(template % key)