You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1435 lines
50 KiB

4 years ago
  1. # Scanner produces tokens of the following types:
  2. # STREAM-START
  3. # STREAM-END
  4. # DIRECTIVE(name, value)
  5. # DOCUMENT-START
  6. # DOCUMENT-END
  7. # BLOCK-SEQUENCE-START
  8. # BLOCK-MAPPING-START
  9. # BLOCK-END
  10. # FLOW-SEQUENCE-START
  11. # FLOW-MAPPING-START
  12. # FLOW-SEQUENCE-END
  13. # FLOW-MAPPING-END
  14. # BLOCK-ENTRY
  15. # FLOW-ENTRY
  16. # KEY
  17. # VALUE
  18. # ALIAS(value)
  19. # ANCHOR(value)
  20. # TAG(value)
  21. # SCALAR(value, plain, style)
  22. #
  23. # Read comments in the Scanner code for more details.
  24. #
  25. __all__ = ['Scanner', 'ScannerError']
  26. from .error import MarkedYAMLError
  27. from .tokens import *
  28. class ScannerError(MarkedYAMLError):
  29. pass
  30. class SimpleKey:
  31. # See below simple keys treatment.
  32. def __init__(self, token_number, required, index, line, column, mark):
  33. self.token_number = token_number
  34. self.required = required
  35. self.index = index
  36. self.line = line
  37. self.column = column
  38. self.mark = mark
  39. class Scanner:
  40. def __init__(self):
  41. """Initialize the scanner."""
  42. # It is assumed that Scanner and Reader will have a common descendant.
  43. # Reader do the dirty work of checking for BOM and converting the
  44. # input data to Unicode. It also adds NUL to the end.
  45. #
  46. # Reader supports the following methods
  47. # self.peek(i=0) # peek the next i-th character
  48. # self.prefix(l=1) # peek the next l characters
  49. # self.forward(l=1) # read the next l characters and move the pointer.
  50. # Had we reached the end of the stream?
  51. self.done = False
  52. # The number of unclosed '{' and '['. `flow_level == 0` means block
  53. # context.
  54. self.flow_level = 0
  55. # List of processed tokens that are not yet emitted.
  56. self.tokens = []
  57. # Add the STREAM-START token.
  58. self.fetch_stream_start()
  59. # Number of tokens that were emitted through the `get_token` method.
  60. self.tokens_taken = 0
  61. # The current indentation level.
  62. self.indent = -1
  63. # Past indentation levels.
  64. self.indents = []
  65. # Variables related to simple keys treatment.
  66. # A simple key is a key that is not denoted by the '?' indicator.
  67. # Example of simple keys:
  68. # ---
  69. # block simple key: value
  70. # ? not a simple key:
  71. # : { flow simple key: value }
  72. # We emit the KEY token before all keys, so when we find a potential
  73. # simple key, we try to locate the corresponding ':' indicator.
  74. # Simple keys should be limited to a single line and 1024 characters.
  75. # Can a simple key start at the current position? A simple key may
  76. # start:
  77. # - at the beginning of the line, not counting indentation spaces
  78. # (in block context),
  79. # - after '{', '[', ',' (in the flow context),
  80. # - after '?', ':', '-' (in the block context).
  81. # In the block context, this flag also signifies if a block collection
  82. # may start at the current position.
  83. self.allow_simple_key = True
  84. # Keep track of possible simple keys. This is a dictionary. The key
  85. # is `flow_level`; there can be no more that one possible simple key
  86. # for each level. The value is a SimpleKey record:
  87. # (token_number, required, index, line, column, mark)
  88. # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
  89. # '[', or '{' tokens.
  90. self.possible_simple_keys = {}
  91. # Public methods.
  92. def check_token(self, *choices):
  93. # Check if the next token is one of the given types.
  94. while self.need_more_tokens():
  95. self.fetch_more_tokens()
  96. if self.tokens:
  97. if not choices:
  98. return True
  99. for choice in choices:
  100. if isinstance(self.tokens[0], choice):
  101. return True
  102. return False
  103. def peek_token(self):
  104. # Return the next token, but do not delete if from the queue.
  105. # Return None if no more tokens.
  106. while self.need_more_tokens():
  107. self.fetch_more_tokens()
  108. if self.tokens:
  109. return self.tokens[0]
  110. else:
  111. return None
  112. def get_token(self):
  113. # Return the next token.
  114. while self.need_more_tokens():
  115. self.fetch_more_tokens()
  116. if self.tokens:
  117. self.tokens_taken += 1
  118. return self.tokens.pop(0)
  119. # Private methods.
  120. def need_more_tokens(self):
  121. if self.done:
  122. return False
  123. if not self.tokens:
  124. return True
  125. # The current token may be a potential simple key, so we
  126. # need to look further.
  127. self.stale_possible_simple_keys()
  128. if self.next_possible_simple_key() == self.tokens_taken:
  129. return True
  130. def fetch_more_tokens(self):
  131. # Eat whitespaces and comments until we reach the next token.
  132. self.scan_to_next_token()
  133. # Remove obsolete possible simple keys.
  134. self.stale_possible_simple_keys()
  135. # Compare the current indentation and column. It may add some tokens
  136. # and decrease the current indentation level.
  137. self.unwind_indent(self.column)
  138. # Peek the next character.
  139. ch = self.peek()
  140. # Is it the end of stream?
  141. if ch == '\0':
  142. return self.fetch_stream_end()
  143. # Is it a directive?
  144. if ch == '%' and self.check_directive():
  145. return self.fetch_directive()
  146. # Is it the document start?
  147. if ch == '-' and self.check_document_start():
  148. return self.fetch_document_start()
  149. # Is it the document end?
  150. if ch == '.' and self.check_document_end():
  151. return self.fetch_document_end()
  152. # TODO: support for BOM within a stream.
  153. #if ch == '\uFEFF':
  154. # return self.fetch_bom() <-- issue BOMToken
  155. # Note: the order of the following checks is NOT significant.
  156. # Is it the flow sequence start indicator?
  157. if ch == '[':
  158. return self.fetch_flow_sequence_start()
  159. # Is it the flow mapping start indicator?
  160. if ch == '{':
  161. return self.fetch_flow_mapping_start()
  162. # Is it the flow sequence end indicator?
  163. if ch == ']':
  164. return self.fetch_flow_sequence_end()
  165. # Is it the flow mapping end indicator?
  166. if ch == '}':
  167. return self.fetch_flow_mapping_end()
  168. # Is it the flow entry indicator?
  169. if ch == ',':
  170. return self.fetch_flow_entry()
  171. # Is it the block entry indicator?
  172. if ch == '-' and self.check_block_entry():
  173. return self.fetch_block_entry()
  174. # Is it the key indicator?
  175. if ch == '?' and self.check_key():
  176. return self.fetch_key()
  177. # Is it the value indicator?
  178. if ch == ':' and self.check_value():
  179. return self.fetch_value()
  180. # Is it an alias?
  181. if ch == '*':
  182. return self.fetch_alias()
  183. # Is it an anchor?
  184. if ch == '&':
  185. return self.fetch_anchor()
  186. # Is it a tag?
  187. if ch == '!':
  188. return self.fetch_tag()
  189. # Is it a literal scalar?
  190. if ch == '|' and not self.flow_level:
  191. return self.fetch_literal()
  192. # Is it a folded scalar?
  193. if ch == '>' and not self.flow_level:
  194. return self.fetch_folded()
  195. # Is it a single quoted scalar?
  196. if ch == '\'':
  197. return self.fetch_single()
  198. # Is it a double quoted scalar?
  199. if ch == '\"':
  200. return self.fetch_double()
  201. # It must be a plain scalar then.
  202. if self.check_plain():
  203. return self.fetch_plain()
  204. # No? It's an error. Let's produce a nice error message.
  205. raise ScannerError("while scanning for the next token", None,
  206. "found character %r that cannot start any token" % ch,
  207. self.get_mark())
  208. # Simple keys treatment.
  209. def next_possible_simple_key(self):
  210. # Return the number of the nearest possible simple key. Actually we
  211. # don't need to loop through the whole dictionary. We may replace it
  212. # with the following code:
  213. # if not self.possible_simple_keys:
  214. # return None
  215. # return self.possible_simple_keys[
  216. # min(self.possible_simple_keys.keys())].token_number
  217. min_token_number = None
  218. for level in self.possible_simple_keys:
  219. key = self.possible_simple_keys[level]
  220. if min_token_number is None or key.token_number < min_token_number:
  221. min_token_number = key.token_number
  222. return min_token_number
  223. def stale_possible_simple_keys(self):
  224. # Remove entries that are no longer possible simple keys. According to
  225. # the YAML specification, simple keys
  226. # - should be limited to a single line,
  227. # - should be no longer than 1024 characters.
  228. # Disabling this procedure will allow simple keys of any length and
  229. # height (may cause problems if indentation is broken though).
  230. for level in list(self.possible_simple_keys):
  231. key = self.possible_simple_keys[level]
  232. if key.line != self.line \
  233. or self.index-key.index > 1024:
  234. if key.required:
  235. raise ScannerError("while scanning a simple key", key.mark,
  236. "could not find expected ':'", self.get_mark())
  237. del self.possible_simple_keys[level]
  238. def save_possible_simple_key(self):
  239. # The next token may start a simple key. We check if it's possible
  240. # and save its position. This function is called for
  241. # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
  242. # Check if a simple key is required at the current position.
  243. required = not self.flow_level and self.indent == self.column
  244. # The next token might be a simple key. Let's save it's number and
  245. # position.
  246. if self.allow_simple_key:
  247. self.remove_possible_simple_key()
  248. token_number = self.tokens_taken+len(self.tokens)
  249. key = SimpleKey(token_number, required,
  250. self.index, self.line, self.column, self.get_mark())
  251. self.possible_simple_keys[self.flow_level] = key
  252. def remove_possible_simple_key(self):
  253. # Remove the saved possible key position at the current flow level.
  254. if self.flow_level in self.possible_simple_keys:
  255. key = self.possible_simple_keys[self.flow_level]
  256. if key.required:
  257. raise ScannerError("while scanning a simple key", key.mark,
  258. "could not find expected ':'", self.get_mark())
  259. del self.possible_simple_keys[self.flow_level]
  260. # Indentation functions.
  261. def unwind_indent(self, column):
  262. ## In flow context, tokens should respect indentation.
  263. ## Actually the condition should be `self.indent >= column` according to
  264. ## the spec. But this condition will prohibit intuitively correct
  265. ## constructions such as
  266. ## key : {
  267. ## }
  268. #if self.flow_level and self.indent > column:
  269. # raise ScannerError(None, None,
  270. # "invalid indentation or unclosed '[' or '{'",
  271. # self.get_mark())
  272. # In the flow context, indentation is ignored. We make the scanner less
  273. # restrictive then specification requires.
  274. if self.flow_level:
  275. return
  276. # In block context, we may need to issue the BLOCK-END tokens.
  277. while self.indent > column:
  278. mark = self.get_mark()
  279. self.indent = self.indents.pop()
  280. self.tokens.append(BlockEndToken(mark, mark))
  281. def add_indent(self, column):
  282. # Check if we need to increase indentation.
  283. if self.indent < column:
  284. self.indents.append(self.indent)
  285. self.indent = column
  286. return True
  287. return False
  288. # Fetchers.
  289. def fetch_stream_start(self):
  290. # We always add STREAM-START as the first token and STREAM-END as the
  291. # last token.
  292. # Read the token.
  293. mark = self.get_mark()
  294. # Add STREAM-START.
  295. self.tokens.append(StreamStartToken(mark, mark,
  296. encoding=self.encoding))
  297. def fetch_stream_end(self):
  298. # Set the current indentation to -1.
  299. self.unwind_indent(-1)
  300. # Reset simple keys.
  301. self.remove_possible_simple_key()
  302. self.allow_simple_key = False
  303. self.possible_simple_keys = {}
  304. # Read the token.
  305. mark = self.get_mark()
  306. # Add STREAM-END.
  307. self.tokens.append(StreamEndToken(mark, mark))
  308. # The steam is finished.
  309. self.done = True
  310. def fetch_directive(self):
  311. # Set the current indentation to -1.
  312. self.unwind_indent(-1)
  313. # Reset simple keys.
  314. self.remove_possible_simple_key()
  315. self.allow_simple_key = False
  316. # Scan and add DIRECTIVE.
  317. self.tokens.append(self.scan_directive())
  318. def fetch_document_start(self):
  319. self.fetch_document_indicator(DocumentStartToken)
  320. def fetch_document_end(self):
  321. self.fetch_document_indicator(DocumentEndToken)
  322. def fetch_document_indicator(self, TokenClass):
  323. # Set the current indentation to -1.
  324. self.unwind_indent(-1)
  325. # Reset simple keys. Note that there could not be a block collection
  326. # after '---'.
  327. self.remove_possible_simple_key()
  328. self.allow_simple_key = False
  329. # Add DOCUMENT-START or DOCUMENT-END.
  330. start_mark = self.get_mark()
  331. self.forward(3)
  332. end_mark = self.get_mark()
  333. self.tokens.append(TokenClass(start_mark, end_mark))
  334. def fetch_flow_sequence_start(self):
  335. self.fetch_flow_collection_start(FlowSequenceStartToken)
  336. def fetch_flow_mapping_start(self):
  337. self.fetch_flow_collection_start(FlowMappingStartToken)
  338. def fetch_flow_collection_start(self, TokenClass):
  339. # '[' and '{' may start a simple key.
  340. self.save_possible_simple_key()
  341. # Increase the flow level.
  342. self.flow_level += 1
  343. # Simple keys are allowed after '[' and '{'.
  344. self.allow_simple_key = True
  345. # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
  346. start_mark = self.get_mark()
  347. self.forward()
  348. end_mark = self.get_mark()
  349. self.tokens.append(TokenClass(start_mark, end_mark))
  350. def fetch_flow_sequence_end(self):
  351. self.fetch_flow_collection_end(FlowSequenceEndToken)
  352. def fetch_flow_mapping_end(self):
  353. self.fetch_flow_collection_end(FlowMappingEndToken)
  354. def fetch_flow_collection_end(self, TokenClass):
  355. # Reset possible simple key on the current level.
  356. self.remove_possible_simple_key()
  357. # Decrease the flow level.
  358. self.flow_level -= 1
  359. # No simple keys after ']' or '}'.
  360. self.allow_simple_key = False
  361. # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
  362. start_mark = self.get_mark()
  363. self.forward()
  364. end_mark = self.get_mark()
  365. self.tokens.append(TokenClass(start_mark, end_mark))
  366. def fetch_flow_entry(self):
  367. # Simple keys are allowed after ','.
  368. self.allow_simple_key = True
  369. # Reset possible simple key on the current level.
  370. self.remove_possible_simple_key()
  371. # Add FLOW-ENTRY.
  372. start_mark = self.get_mark()
  373. self.forward()
  374. end_mark = self.get_mark()
  375. self.tokens.append(FlowEntryToken(start_mark, end_mark))
  376. def fetch_block_entry(self):
  377. # Block context needs additional checks.
  378. if not self.flow_level:
  379. # Are we allowed to start a new entry?
  380. if not self.allow_simple_key:
  381. raise ScannerError(None, None,
  382. "sequence entries are not allowed here",
  383. self.get_mark())
  384. # We may need to add BLOCK-SEQUENCE-START.
  385. if self.add_indent(self.column):
  386. mark = self.get_mark()
  387. self.tokens.append(BlockSequenceStartToken(mark, mark))
  388. # It's an error for the block entry to occur in the flow context,
  389. # but we let the parser detect this.
  390. else:
  391. pass
  392. # Simple keys are allowed after '-'.
  393. self.allow_simple_key = True
  394. # Reset possible simple key on the current level.
  395. self.remove_possible_simple_key()
  396. # Add BLOCK-ENTRY.
  397. start_mark = self.get_mark()
  398. self.forward()
  399. end_mark = self.get_mark()
  400. self.tokens.append(BlockEntryToken(start_mark, end_mark))
  401. def fetch_key(self):
  402. # Block context needs additional checks.
  403. if not self.flow_level:
  404. # Are we allowed to start a key (not necessary a simple)?
  405. if not self.allow_simple_key:
  406. raise ScannerError(None, None,
  407. "mapping keys are not allowed here",
  408. self.get_mark())
  409. # We may need to add BLOCK-MAPPING-START.
  410. if self.add_indent(self.column):
  411. mark = self.get_mark()
  412. self.tokens.append(BlockMappingStartToken(mark, mark))
  413. # Simple keys are allowed after '?' in the block context.
  414. self.allow_simple_key = not self.flow_level
  415. # Reset possible simple key on the current level.
  416. self.remove_possible_simple_key()
  417. # Add KEY.
  418. start_mark = self.get_mark()
  419. self.forward()
  420. end_mark = self.get_mark()
  421. self.tokens.append(KeyToken(start_mark, end_mark))
  422. def fetch_value(self):
  423. # Do we determine a simple key?
  424. if self.flow_level in self.possible_simple_keys:
  425. # Add KEY.
  426. key = self.possible_simple_keys[self.flow_level]
  427. del self.possible_simple_keys[self.flow_level]
  428. self.tokens.insert(key.token_number-self.tokens_taken,
  429. KeyToken(key.mark, key.mark))
  430. # If this key starts a new block mapping, we need to add
  431. # BLOCK-MAPPING-START.
  432. if not self.flow_level:
  433. if self.add_indent(key.column):
  434. self.tokens.insert(key.token_number-self.tokens_taken,
  435. BlockMappingStartToken(key.mark, key.mark))
  436. # There cannot be two simple keys one after another.
  437. self.allow_simple_key = False
  438. # It must be a part of a complex key.
  439. else:
  440. # Block context needs additional checks.
  441. # (Do we really need them? They will be caught by the parser
  442. # anyway.)
  443. if not self.flow_level:
  444. # We are allowed to start a complex value if and only if
  445. # we can start a simple key.
  446. if not self.allow_simple_key:
  447. raise ScannerError(None, None,
  448. "mapping values are not allowed here",
  449. self.get_mark())
  450. # If this value starts a new block mapping, we need to add
  451. # BLOCK-MAPPING-START. It will be detected as an error later by
  452. # the parser.
  453. if not self.flow_level:
  454. if self.add_indent(self.column):
  455. mark = self.get_mark()
  456. self.tokens.append(BlockMappingStartToken(mark, mark))
  457. # Simple keys are allowed after ':' in the block context.
  458. self.allow_simple_key = not self.flow_level
  459. # Reset possible simple key on the current level.
  460. self.remove_possible_simple_key()
  461. # Add VALUE.
  462. start_mark = self.get_mark()
  463. self.forward()
  464. end_mark = self.get_mark()
  465. self.tokens.append(ValueToken(start_mark, end_mark))
  466. def fetch_alias(self):
  467. # ALIAS could be a simple key.
  468. self.save_possible_simple_key()
  469. # No simple keys after ALIAS.
  470. self.allow_simple_key = False
  471. # Scan and add ALIAS.
  472. self.tokens.append(self.scan_anchor(AliasToken))
  473. def fetch_anchor(self):
  474. # ANCHOR could start a simple key.
  475. self.save_possible_simple_key()
  476. # No simple keys after ANCHOR.
  477. self.allow_simple_key = False
  478. # Scan and add ANCHOR.
  479. self.tokens.append(self.scan_anchor(AnchorToken))
  480. def fetch_tag(self):
  481. # TAG could start a simple key.
  482. self.save_possible_simple_key()
  483. # No simple keys after TAG.
  484. self.allow_simple_key = False
  485. # Scan and add TAG.
  486. self.tokens.append(self.scan_tag())
  487. def fetch_literal(self):
  488. self.fetch_block_scalar(style='|')
  489. def fetch_folded(self):
  490. self.fetch_block_scalar(style='>')
  491. def fetch_block_scalar(self, style):
  492. # A simple key may follow a block scalar.
  493. self.allow_simple_key = True
  494. # Reset possible simple key on the current level.
  495. self.remove_possible_simple_key()
  496. # Scan and add SCALAR.
  497. self.tokens.append(self.scan_block_scalar(style))
  498. def fetch_single(self):
  499. self.fetch_flow_scalar(style='\'')
  500. def fetch_double(self):
  501. self.fetch_flow_scalar(style='"')
  502. def fetch_flow_scalar(self, style):
  503. # A flow scalar could be a simple key.
  504. self.save_possible_simple_key()
  505. # No simple keys after flow scalars.
  506. self.allow_simple_key = False
  507. # Scan and add SCALAR.
  508. self.tokens.append(self.scan_flow_scalar(style))
  509. def fetch_plain(self):
  510. # A plain scalar could be a simple key.
  511. self.save_possible_simple_key()
  512. # No simple keys after plain scalars. But note that `scan_plain` will
  513. # change this flag if the scan is finished at the beginning of the
  514. # line.
  515. self.allow_simple_key = False
  516. # Scan and add SCALAR. May change `allow_simple_key`.
  517. self.tokens.append(self.scan_plain())
  518. # Checkers.
  519. def check_directive(self):
  520. # DIRECTIVE: ^ '%' ...
  521. # The '%' indicator is already checked.
  522. if self.column == 0:
  523. return True
  524. def check_document_start(self):
  525. # DOCUMENT-START: ^ '---' (' '|'\n')
  526. if self.column == 0:
  527. if self.prefix(3) == '---' \
  528. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  529. return True
  530. def check_document_end(self):
  531. # DOCUMENT-END: ^ '...' (' '|'\n')
  532. if self.column == 0:
  533. if self.prefix(3) == '...' \
  534. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  535. return True
  536. def check_block_entry(self):
  537. # BLOCK-ENTRY: '-' (' '|'\n')
  538. return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  539. def check_key(self):
  540. # KEY(flow context): '?'
  541. if self.flow_level:
  542. return True
  543. # KEY(block context): '?' (' '|'\n')
  544. else:
  545. return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  546. def check_value(self):
  547. # VALUE(flow context): ':'
  548. if self.flow_level:
  549. return True
  550. # VALUE(block context): ':' (' '|'\n')
  551. else:
  552. return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  553. def check_plain(self):
  554. # A plain scalar may start with any non-space character except:
  555. # '-', '?', ':', ',', '[', ']', '{', '}',
  556. # '#', '&', '*', '!', '|', '>', '\'', '\"',
  557. # '%', '@', '`'.
  558. #
  559. # It may also start with
  560. # '-', '?', ':'
  561. # if it is followed by a non-space character.
  562. #
  563. # Note that we limit the last rule to the block context (except the
  564. # '-' character) because we want the flow context to be space
  565. # independent.
  566. ch = self.peek()
  567. return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
  568. or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
  569. and (ch == '-' or (not self.flow_level and ch in '?:')))
  570. # Scanners.
  571. def scan_to_next_token(self):
  572. # We ignore spaces, line breaks and comments.
  573. # If we find a line break in the block context, we set the flag
  574. # `allow_simple_key` on.
  575. # The byte order mark is stripped if it's the first character in the
  576. # stream. We do not yet support BOM inside the stream as the
  577. # specification requires. Any such mark will be considered as a part
  578. # of the document.
  579. #
  580. # TODO: We need to make tab handling rules more sane. A good rule is
  581. # Tabs cannot precede tokens
  582. # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
  583. # KEY(block), VALUE(block), BLOCK-ENTRY
  584. # So the checking code is
  585. # if <TAB>:
  586. # self.allow_simple_keys = False
  587. # We also need to add the check for `allow_simple_keys == True` to
  588. # `unwind_indent` before issuing BLOCK-END.
  589. # Scanners for block, flow, and plain scalars need to be modified.
  590. if self.index == 0 and self.peek() == '\uFEFF':
  591. self.forward()
  592. found = False
  593. while not found:
  594. while self.peek() == ' ':
  595. self.forward()
  596. if self.peek() == '#':
  597. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  598. self.forward()
  599. if self.scan_line_break():
  600. if not self.flow_level:
  601. self.allow_simple_key = True
  602. else:
  603. found = True
  604. def scan_directive(self):
  605. # See the specification for details.
  606. start_mark = self.get_mark()
  607. self.forward()
  608. name = self.scan_directive_name(start_mark)
  609. value = None
  610. if name == 'YAML':
  611. value = self.scan_yaml_directive_value(start_mark)
  612. end_mark = self.get_mark()
  613. elif name == 'TAG':
  614. value = self.scan_tag_directive_value(start_mark)
  615. end_mark = self.get_mark()
  616. else:
  617. end_mark = self.get_mark()
  618. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  619. self.forward()
  620. self.scan_directive_ignored_line(start_mark)
  621. return DirectiveToken(name, value, start_mark, end_mark)
  622. def scan_directive_name(self, start_mark):
  623. # See the specification for details.
  624. length = 0
  625. ch = self.peek(length)
  626. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  627. or ch in '-_':
  628. length += 1
  629. ch = self.peek(length)
  630. if not length:
  631. raise ScannerError("while scanning a directive", start_mark,
  632. "expected alphabetic or numeric character, but found %r"
  633. % ch, self.get_mark())
  634. value = self.prefix(length)
  635. self.forward(length)
  636. ch = self.peek()
  637. if ch not in '\0 \r\n\x85\u2028\u2029':
  638. raise ScannerError("while scanning a directive", start_mark,
  639. "expected alphabetic or numeric character, but found %r"
  640. % ch, self.get_mark())
  641. return value
  642. def scan_yaml_directive_value(self, start_mark):
  643. # See the specification for details.
  644. while self.peek() == ' ':
  645. self.forward()
  646. major = self.scan_yaml_directive_number(start_mark)
  647. if self.peek() != '.':
  648. raise ScannerError("while scanning a directive", start_mark,
  649. "expected a digit or '.', but found %r" % self.peek(),
  650. self.get_mark())
  651. self.forward()
  652. minor = self.scan_yaml_directive_number(start_mark)
  653. if self.peek() not in '\0 \r\n\x85\u2028\u2029':
  654. raise ScannerError("while scanning a directive", start_mark,
  655. "expected a digit or ' ', but found %r" % self.peek(),
  656. self.get_mark())
  657. return (major, minor)
  658. def scan_yaml_directive_number(self, start_mark):
  659. # See the specification for details.
  660. ch = self.peek()
  661. if not ('0' <= ch <= '9'):
  662. raise ScannerError("while scanning a directive", start_mark,
  663. "expected a digit, but found %r" % ch, self.get_mark())
  664. length = 0
  665. while '0' <= self.peek(length) <= '9':
  666. length += 1
  667. value = int(self.prefix(length))
  668. self.forward(length)
  669. return value
  670. def scan_tag_directive_value(self, start_mark):
  671. # See the specification for details.
  672. while self.peek() == ' ':
  673. self.forward()
  674. handle = self.scan_tag_directive_handle(start_mark)
  675. while self.peek() == ' ':
  676. self.forward()
  677. prefix = self.scan_tag_directive_prefix(start_mark)
  678. return (handle, prefix)
  679. def scan_tag_directive_handle(self, start_mark):
  680. # See the specification for details.
  681. value = self.scan_tag_handle('directive', start_mark)
  682. ch = self.peek()
  683. if ch != ' ':
  684. raise ScannerError("while scanning a directive", start_mark,
  685. "expected ' ', but found %r" % ch, self.get_mark())
  686. return value
  687. def scan_tag_directive_prefix(self, start_mark):
  688. # See the specification for details.
  689. value = self.scan_tag_uri('directive', start_mark)
  690. ch = self.peek()
  691. if ch not in '\0 \r\n\x85\u2028\u2029':
  692. raise ScannerError("while scanning a directive", start_mark,
  693. "expected ' ', but found %r" % ch, self.get_mark())
  694. return value
  695. def scan_directive_ignored_line(self, start_mark):
  696. # See the specification for details.
  697. while self.peek() == ' ':
  698. self.forward()
  699. if self.peek() == '#':
  700. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  701. self.forward()
  702. ch = self.peek()
  703. if ch not in '\0\r\n\x85\u2028\u2029':
  704. raise ScannerError("while scanning a directive", start_mark,
  705. "expected a comment or a line break, but found %r"
  706. % ch, self.get_mark())
  707. self.scan_line_break()
  708. def scan_anchor(self, TokenClass):
  709. # The specification does not restrict characters for anchors and
  710. # aliases. This may lead to problems, for instance, the document:
  711. # [ *alias, value ]
  712. # can be interpreted in two ways, as
  713. # [ "value" ]
  714. # and
  715. # [ *alias , "value" ]
  716. # Therefore we restrict aliases to numbers and ASCII letters.
  717. start_mark = self.get_mark()
  718. indicator = self.peek()
  719. if indicator == '*':
  720. name = 'alias'
  721. else:
  722. name = 'anchor'
  723. self.forward()
  724. length = 0
  725. ch = self.peek(length)
  726. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  727. or ch in '-_':
  728. length += 1
  729. ch = self.peek(length)
  730. if not length:
  731. raise ScannerError("while scanning an %s" % name, start_mark,
  732. "expected alphabetic or numeric character, but found %r"
  733. % ch, self.get_mark())
  734. value = self.prefix(length)
  735. self.forward(length)
  736. ch = self.peek()
  737. if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
  738. raise ScannerError("while scanning an %s" % name, start_mark,
  739. "expected alphabetic or numeric character, but found %r"
  740. % ch, self.get_mark())
  741. end_mark = self.get_mark()
  742. return TokenClass(value, start_mark, end_mark)
  743. def scan_tag(self):
  744. # See the specification for details.
  745. start_mark = self.get_mark()
  746. ch = self.peek(1)
  747. if ch == '<':
  748. handle = None
  749. self.forward(2)
  750. suffix = self.scan_tag_uri('tag', start_mark)
  751. if self.peek() != '>':
  752. raise ScannerError("while parsing a tag", start_mark,
  753. "expected '>', but found %r" % self.peek(),
  754. self.get_mark())
  755. self.forward()
  756. elif ch in '\0 \t\r\n\x85\u2028\u2029':
  757. handle = None
  758. suffix = '!'
  759. self.forward()
  760. else:
  761. length = 1
  762. use_handle = False
  763. while ch not in '\0 \r\n\x85\u2028\u2029':
  764. if ch == '!':
  765. use_handle = True
  766. break
  767. length += 1
  768. ch = self.peek(length)
  769. handle = '!'
  770. if use_handle:
  771. handle = self.scan_tag_handle('tag', start_mark)
  772. else:
  773. handle = '!'
  774. self.forward()
  775. suffix = self.scan_tag_uri('tag', start_mark)
  776. ch = self.peek()
  777. if ch not in '\0 \r\n\x85\u2028\u2029':
  778. raise ScannerError("while scanning a tag", start_mark,
  779. "expected ' ', but found %r" % ch, self.get_mark())
  780. value = (handle, suffix)
  781. end_mark = self.get_mark()
  782. return TagToken(value, start_mark, end_mark)
  783. def scan_block_scalar(self, style):
  784. # See the specification for details.
  785. if style == '>':
  786. folded = True
  787. else:
  788. folded = False
  789. chunks = []
  790. start_mark = self.get_mark()
  791. # Scan the header.
  792. self.forward()
  793. chomping, increment = self.scan_block_scalar_indicators(start_mark)
  794. self.scan_block_scalar_ignored_line(start_mark)
  795. # Determine the indentation level and go to the first non-empty line.
  796. min_indent = self.indent+1
  797. if min_indent < 1:
  798. min_indent = 1
  799. if increment is None:
  800. breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
  801. indent = max(min_indent, max_indent)
  802. else:
  803. indent = min_indent+increment-1
  804. breaks, end_mark = self.scan_block_scalar_breaks(indent)
  805. line_break = ''
  806. # Scan the inner part of the block scalar.
  807. while self.column == indent and self.peek() != '\0':
  808. chunks.extend(breaks)
  809. leading_non_space = self.peek() not in ' \t'
  810. length = 0
  811. while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
  812. length += 1
  813. chunks.append(self.prefix(length))
  814. self.forward(length)
  815. line_break = self.scan_line_break()
  816. breaks, end_mark = self.scan_block_scalar_breaks(indent)
  817. if self.column == indent and self.peek() != '\0':
  818. # Unfortunately, folding rules are ambiguous.
  819. #
  820. # This is the folding according to the specification:
  821. if folded and line_break == '\n' \
  822. and leading_non_space and self.peek() not in ' \t':
  823. if not breaks:
  824. chunks.append(' ')
  825. else:
  826. chunks.append(line_break)
  827. # This is Clark Evans's interpretation (also in the spec
  828. # examples):
  829. #
  830. #if folded and line_break == '\n':
  831. # if not breaks:
  832. # if self.peek() not in ' \t':
  833. # chunks.append(' ')
  834. # else:
  835. # chunks.append(line_break)
  836. #else:
  837. # chunks.append(line_break)
  838. else:
  839. break
  840. # Chomp the tail.
  841. if chomping is not False:
  842. chunks.append(line_break)
  843. if chomping is True:
  844. chunks.extend(breaks)
  845. # We are done.
  846. return ScalarToken(''.join(chunks), False, start_mark, end_mark,
  847. style)
  848. def scan_block_scalar_indicators(self, start_mark):
  849. # See the specification for details.
  850. chomping = None
  851. increment = None
  852. ch = self.peek()
  853. if ch in '+-':
  854. if ch == '+':
  855. chomping = True
  856. else:
  857. chomping = False
  858. self.forward()
  859. ch = self.peek()
  860. if ch in '0123456789':
  861. increment = int(ch)
  862. if increment == 0:
  863. raise ScannerError("while scanning a block scalar", start_mark,
  864. "expected indentation indicator in the range 1-9, but found 0",
  865. self.get_mark())
  866. self.forward()
  867. elif ch in '0123456789':
  868. increment = int(ch)
  869. if increment == 0:
  870. raise ScannerError("while scanning a block scalar", start_mark,
  871. "expected indentation indicator in the range 1-9, but found 0",
  872. self.get_mark())
  873. self.forward()
  874. ch = self.peek()
  875. if ch in '+-':
  876. if ch == '+':
  877. chomping = True
  878. else:
  879. chomping = False
  880. self.forward()
  881. ch = self.peek()
  882. if ch not in '\0 \r\n\x85\u2028\u2029':
  883. raise ScannerError("while scanning a block scalar", start_mark,
  884. "expected chomping or indentation indicators, but found %r"
  885. % ch, self.get_mark())
  886. return chomping, increment
  887. def scan_block_scalar_ignored_line(self, start_mark):
  888. # See the specification for details.
  889. while self.peek() == ' ':
  890. self.forward()
  891. if self.peek() == '#':
  892. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  893. self.forward()
  894. ch = self.peek()
  895. if ch not in '\0\r\n\x85\u2028\u2029':
  896. raise ScannerError("while scanning a block scalar", start_mark,
  897. "expected a comment or a line break, but found %r" % ch,
  898. self.get_mark())
  899. self.scan_line_break()
  900. def scan_block_scalar_indentation(self):
  901. # See the specification for details.
  902. chunks = []
  903. max_indent = 0
  904. end_mark = self.get_mark()
  905. while self.peek() in ' \r\n\x85\u2028\u2029':
  906. if self.peek() != ' ':
  907. chunks.append(self.scan_line_break())
  908. end_mark = self.get_mark()
  909. else:
  910. self.forward()
  911. if self.column > max_indent:
  912. max_indent = self.column
  913. return chunks, max_indent, end_mark
  914. def scan_block_scalar_breaks(self, indent):
  915. # See the specification for details.
  916. chunks = []
  917. end_mark = self.get_mark()
  918. while self.column < indent and self.peek() == ' ':
  919. self.forward()
  920. while self.peek() in '\r\n\x85\u2028\u2029':
  921. chunks.append(self.scan_line_break())
  922. end_mark = self.get_mark()
  923. while self.column < indent and self.peek() == ' ':
  924. self.forward()
  925. return chunks, end_mark
  926. def scan_flow_scalar(self, style):
  927. # See the specification for details.
  928. # Note that we loose indentation rules for quoted scalars. Quoted
  929. # scalars don't need to adhere indentation because " and ' clearly
  930. # mark the beginning and the end of them. Therefore we are less
  931. # restrictive then the specification requires. We only need to check
  932. # that document separators are not included in scalars.
  933. if style == '"':
  934. double = True
  935. else:
  936. double = False
  937. chunks = []
  938. start_mark = self.get_mark()
  939. quote = self.peek()
  940. self.forward()
  941. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  942. while self.peek() != quote:
  943. chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
  944. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  945. self.forward()
  946. end_mark = self.get_mark()
  947. return ScalarToken(''.join(chunks), False, start_mark, end_mark,
  948. style)
  949. ESCAPE_REPLACEMENTS = {
  950. '0': '\0',
  951. 'a': '\x07',
  952. 'b': '\x08',
  953. 't': '\x09',
  954. '\t': '\x09',
  955. 'n': '\x0A',
  956. 'v': '\x0B',
  957. 'f': '\x0C',
  958. 'r': '\x0D',
  959. 'e': '\x1B',
  960. ' ': '\x20',
  961. '\"': '\"',
  962. '\\': '\\',
  963. '/': '/',
  964. 'N': '\x85',
  965. '_': '\xA0',
  966. 'L': '\u2028',
  967. 'P': '\u2029',
  968. }
  969. ESCAPE_CODES = {
  970. 'x': 2,
  971. 'u': 4,
  972. 'U': 8,
  973. }
  974. def scan_flow_scalar_non_spaces(self, double, start_mark):
  975. # See the specification for details.
  976. chunks = []
  977. while True:
  978. length = 0
  979. while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
  980. length += 1
  981. if length:
  982. chunks.append(self.prefix(length))
  983. self.forward(length)
  984. ch = self.peek()
  985. if not double and ch == '\'' and self.peek(1) == '\'':
  986. chunks.append('\'')
  987. self.forward(2)
  988. elif (double and ch == '\'') or (not double and ch in '\"\\'):
  989. chunks.append(ch)
  990. self.forward()
  991. elif double and ch == '\\':
  992. self.forward()
  993. ch = self.peek()
  994. if ch in self.ESCAPE_REPLACEMENTS:
  995. chunks.append(self.ESCAPE_REPLACEMENTS[ch])
  996. self.forward()
  997. elif ch in self.ESCAPE_CODES:
  998. length = self.ESCAPE_CODES[ch]
  999. self.forward()
  1000. for k in range(length):
  1001. if self.peek(k) not in '0123456789ABCDEFabcdef':
  1002. raise ScannerError("while scanning a double-quoted scalar", start_mark,
  1003. "expected escape sequence of %d hexdecimal numbers, but found %r" %
  1004. (length, self.peek(k)), self.get_mark())
  1005. code = int(self.prefix(length), 16)
  1006. chunks.append(chr(code))
  1007. self.forward(length)
  1008. elif ch in '\r\n\x85\u2028\u2029':
  1009. self.scan_line_break()
  1010. chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
  1011. else:
  1012. raise ScannerError("while scanning a double-quoted scalar", start_mark,
  1013. "found unknown escape character %r" % ch, self.get_mark())
  1014. else:
  1015. return chunks
  1016. def scan_flow_scalar_spaces(self, double, start_mark):
  1017. # See the specification for details.
  1018. chunks = []
  1019. length = 0
  1020. while self.peek(length) in ' \t':
  1021. length += 1
  1022. whitespaces = self.prefix(length)
  1023. self.forward(length)
  1024. ch = self.peek()
  1025. if ch == '\0':
  1026. raise ScannerError("while scanning a quoted scalar", start_mark,
  1027. "found unexpected end of stream", self.get_mark())
  1028. elif ch in '\r\n\x85\u2028\u2029':
  1029. line_break = self.scan_line_break()
  1030. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1031. if line_break != '\n':
  1032. chunks.append(line_break)
  1033. elif not breaks:
  1034. chunks.append(' ')
  1035. chunks.extend(breaks)
  1036. else:
  1037. chunks.append(whitespaces)
  1038. return chunks
  1039. def scan_flow_scalar_breaks(self, double, start_mark):
  1040. # See the specification for details.
  1041. chunks = []
  1042. while True:
  1043. # Instead of checking indentation, we check for document
  1044. # separators.
  1045. prefix = self.prefix(3)
  1046. if (prefix == '---' or prefix == '...') \
  1047. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1048. raise ScannerError("while scanning a quoted scalar", start_mark,
  1049. "found unexpected document separator", self.get_mark())
  1050. while self.peek() in ' \t':
  1051. self.forward()
  1052. if self.peek() in '\r\n\x85\u2028\u2029':
  1053. chunks.append(self.scan_line_break())
  1054. else:
  1055. return chunks
  1056. def scan_plain(self):
  1057. # See the specification for details.
  1058. # We add an additional restriction for the flow context:
  1059. # plain scalars in the flow context cannot contain ',' or '?'.
  1060. # We also keep track of the `allow_simple_key` flag here.
  1061. # Indentation rules are loosed for the flow context.
  1062. chunks = []
  1063. start_mark = self.get_mark()
  1064. end_mark = start_mark
  1065. indent = self.indent+1
  1066. # We allow zero indentation for scalars, but then we need to check for
  1067. # document separators at the beginning of the line.
  1068. #if indent == 0:
  1069. # indent = 1
  1070. spaces = []
  1071. while True:
  1072. length = 0
  1073. if self.peek() == '#':
  1074. break
  1075. while True:
  1076. ch = self.peek(length)
  1077. if ch in '\0 \t\r\n\x85\u2028\u2029' \
  1078. or (ch == ':' and
  1079. self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029'
  1080. + (u',[]{}' if self.flow_level else u''))\
  1081. or (self.flow_level and ch in ',?[]{}'):
  1082. break
  1083. length += 1
  1084. if length == 0:
  1085. break
  1086. self.allow_simple_key = False
  1087. chunks.extend(spaces)
  1088. chunks.append(self.prefix(length))
  1089. self.forward(length)
  1090. end_mark = self.get_mark()
  1091. spaces = self.scan_plain_spaces(indent, start_mark)
  1092. if not spaces or self.peek() == '#' \
  1093. or (not self.flow_level and self.column < indent):
  1094. break
  1095. return ScalarToken(''.join(chunks), True, start_mark, end_mark)
  1096. def scan_plain_spaces(self, indent, start_mark):
  1097. # See the specification for details.
  1098. # The specification is really confusing about tabs in plain scalars.
  1099. # We just forbid them completely. Do not use tabs in YAML!
  1100. chunks = []
  1101. length = 0
  1102. while self.peek(length) in ' ':
  1103. length += 1
  1104. whitespaces = self.prefix(length)
  1105. self.forward(length)
  1106. ch = self.peek()
  1107. if ch in '\r\n\x85\u2028\u2029':
  1108. line_break = self.scan_line_break()
  1109. self.allow_simple_key = True
  1110. prefix = self.prefix(3)
  1111. if (prefix == '---' or prefix == '...') \
  1112. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1113. return
  1114. breaks = []
  1115. while self.peek() in ' \r\n\x85\u2028\u2029':
  1116. if self.peek() == ' ':
  1117. self.forward()
  1118. else:
  1119. breaks.append(self.scan_line_break())
  1120. prefix = self.prefix(3)
  1121. if (prefix == '---' or prefix == '...') \
  1122. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1123. return
  1124. if line_break != '\n':
  1125. chunks.append(line_break)
  1126. elif not breaks:
  1127. chunks.append(' ')
  1128. chunks.extend(breaks)
  1129. elif whitespaces:
  1130. chunks.append(whitespaces)
  1131. return chunks
  1132. def scan_tag_handle(self, name, start_mark):
  1133. # See the specification for details.
  1134. # For some strange reasons, the specification does not allow '_' in
  1135. # tag handles. I have allowed it anyway.
  1136. ch = self.peek()
  1137. if ch != '!':
  1138. raise ScannerError("while scanning a %s" % name, start_mark,
  1139. "expected '!', but found %r" % ch, self.get_mark())
  1140. length = 1
  1141. ch = self.peek(length)
  1142. if ch != ' ':
  1143. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1144. or ch in '-_':
  1145. length += 1
  1146. ch = self.peek(length)
  1147. if ch != '!':
  1148. self.forward(length)
  1149. raise ScannerError("while scanning a %s" % name, start_mark,
  1150. "expected '!', but found %r" % ch, self.get_mark())
  1151. length += 1
  1152. value = self.prefix(length)
  1153. self.forward(length)
  1154. return value
  1155. def scan_tag_uri(self, name, start_mark):
  1156. # See the specification for details.
  1157. # Note: we do not check if URI is well-formed.
  1158. chunks = []
  1159. length = 0
  1160. ch = self.peek(length)
  1161. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1162. or ch in '-;/?:@&=+$,_.!~*\'()[]%':
  1163. if ch == '%':
  1164. chunks.append(self.prefix(length))
  1165. self.forward(length)
  1166. length = 0
  1167. chunks.append(self.scan_uri_escapes(name, start_mark))
  1168. else:
  1169. length += 1
  1170. ch = self.peek(length)
  1171. if length:
  1172. chunks.append(self.prefix(length))
  1173. self.forward(length)
  1174. length = 0
  1175. if not chunks:
  1176. raise ScannerError("while parsing a %s" % name, start_mark,
  1177. "expected URI, but found %r" % ch, self.get_mark())
  1178. return ''.join(chunks)
  1179. def scan_uri_escapes(self, name, start_mark):
  1180. # See the specification for details.
  1181. codes = []
  1182. mark = self.get_mark()
  1183. while self.peek() == '%':
  1184. self.forward()
  1185. for k in range(2):
  1186. if self.peek(k) not in '0123456789ABCDEFabcdef':
  1187. raise ScannerError("while scanning a %s" % name, start_mark,
  1188. "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
  1189. % self.peek(k), self.get_mark())
  1190. codes.append(int(self.prefix(2), 16))
  1191. self.forward(2)
  1192. try:
  1193. value = bytes(codes).decode('utf-8')
  1194. except UnicodeDecodeError as exc:
  1195. raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
  1196. return value
  1197. def scan_line_break(self):
  1198. # Transforms:
  1199. # '\r\n' : '\n'
  1200. # '\r' : '\n'
  1201. # '\n' : '\n'
  1202. # '\x85' : '\n'
  1203. # '\u2028' : '\u2028'
  1204. # '\u2029 : '\u2029'
  1205. # default : ''
  1206. ch = self.peek()
  1207. if ch in '\r\n\x85':
  1208. if self.prefix(2) == '\r\n':
  1209. self.forward(2)
  1210. else:
  1211. self.forward()
  1212. return '\n'
  1213. elif ch in '\u2028\u2029':
  1214. self.forward()
  1215. return ch
  1216. return ''