You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

106 lines
3.7 KiB

4 years ago
  1. ######################## BEGIN LICENSE BLOCK ########################
  2. # The Original Code is Mozilla Communicator client code.
  3. #
  4. # The Initial Developer of the Original Code is
  5. # Netscape Communications Corporation.
  6. # Portions created by the Initial Developer are Copyright (C) 1998
  7. # the Initial Developer. All Rights Reserved.
  8. #
  9. # Contributor(s):
  10. # Mark Pilgrim - port to Python
  11. #
  12. # This library is free software; you can redistribute it and/or
  13. # modify it under the terms of the GNU Lesser General Public
  14. # License as published by the Free Software Foundation; either
  15. # version 2.1 of the License, or (at your option) any later version.
  16. #
  17. # This library is distributed in the hope that it will be useful,
  18. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. # Lesser General Public License for more details.
  21. #
  22. # You should have received a copy of the GNU Lesser General Public
  23. # License along with this library; if not, write to the Free Software
  24. # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  25. # 02110-1301 USA
  26. ######################### END LICENSE BLOCK #########################
  27. from .enums import ProbingState
  28. from .charsetprober import CharSetProber
  29. class CharSetGroupProber(CharSetProber):
  30. def __init__(self, lang_filter=None):
  31. super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
  32. self._active_num = 0
  33. self.probers = []
  34. self._best_guess_prober = None
  35. def reset(self):
  36. super(CharSetGroupProber, self).reset()
  37. self._active_num = 0
  38. for prober in self.probers:
  39. if prober:
  40. prober.reset()
  41. prober.active = True
  42. self._active_num += 1
  43. self._best_guess_prober = None
  44. @property
  45. def charset_name(self):
  46. if not self._best_guess_prober:
  47. self.get_confidence()
  48. if not self._best_guess_prober:
  49. return None
  50. return self._best_guess_prober.charset_name
  51. @property
  52. def language(self):
  53. if not self._best_guess_prober:
  54. self.get_confidence()
  55. if not self._best_guess_prober:
  56. return None
  57. return self._best_guess_prober.language
  58. def feed(self, byte_str):
  59. for prober in self.probers:
  60. if not prober:
  61. continue
  62. if not prober.active:
  63. continue
  64. state = prober.feed(byte_str)
  65. if not state:
  66. continue
  67. if state == ProbingState.FOUND_IT:
  68. self._best_guess_prober = prober
  69. return self.state
  70. elif state == ProbingState.NOT_ME:
  71. prober.active = False
  72. self._active_num -= 1
  73. if self._active_num <= 0:
  74. self._state = ProbingState.NOT_ME
  75. return self.state
  76. return self.state
  77. def get_confidence(self):
  78. state = self.state
  79. if state == ProbingState.FOUND_IT:
  80. return 0.99
  81. elif state == ProbingState.NOT_ME:
  82. return 0.01
  83. best_conf = 0.0
  84. self._best_guess_prober = None
  85. for prober in self.probers:
  86. if not prober:
  87. continue
  88. if not prober.active:
  89. self.logger.debug('%s not active', prober.charset_name)
  90. continue
  91. conf = prober.get_confidence()
  92. self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
  93. if best_conf < conf:
  94. best_conf = conf
  95. self._best_guess_prober = prober
  96. if not self._best_guess_prober:
  97. return 0.0
  98. return best_conf