You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

73 lines
3.5 KiB

4 years ago
  1. ######################## BEGIN LICENSE BLOCK ########################
  2. # The Original Code is Mozilla Universal charset detector code.
  3. #
  4. # The Initial Developer of the Original Code is
  5. # Netscape Communications Corporation.
  6. # Portions created by the Initial Developer are Copyright (C) 2001
  7. # the Initial Developer. All Rights Reserved.
  8. #
  9. # Contributor(s):
  10. # Mark Pilgrim - port to Python
  11. # Shy Shalom - original C code
  12. #
  13. # This library is free software; you can redistribute it and/or
  14. # modify it under the terms of the GNU Lesser General Public
  15. # License as published by the Free Software Foundation; either
  16. # version 2.1 of the License, or (at your option) any later version.
  17. #
  18. # This library is distributed in the hope that it will be useful,
  19. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  21. # Lesser General Public License for more details.
  22. #
  23. # You should have received a copy of the GNU Lesser General Public
  24. # License along with this library; if not, write to the Free Software
  25. # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  26. # 02110-1301 USA
  27. ######################### END LICENSE BLOCK #########################
  28. from .charsetgroupprober import CharSetGroupProber
  29. from .sbcharsetprober import SingleByteCharSetProber
  30. from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
  31. Latin5CyrillicModel, MacCyrillicModel,
  32. Ibm866Model, Ibm855Model)
  33. from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
  34. from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
  35. # from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
  36. from .langthaimodel import TIS620ThaiModel
  37. from .langhebrewmodel import Win1255HebrewModel
  38. from .hebrewprober import HebrewProber
  39. from .langturkishmodel import Latin5TurkishModel
  class SBCSGroupProber(CharSetGroupProber):
      """Group prober made of single-byte charset probers.

      On construction it fills ``self.probers`` with one
      ``SingleByteCharSetProber`` per imported language model, followed by
      a ``HebrewProber`` and the two Win1255 Hebrew probers attached to it,
      and then calls ``self.reset()``.
      """

      def __init__(self):
          super(SBCSGroupProber, self).__init__()

          # TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
          # after we retrain model. Until then Latin2HungarianModel and
          # Win1250HungarianModel are intentionally absent from this tuple.
          language_models = (
              Win1251CyrillicModel,
              Koi8rModel,
              Latin5CyrillicModel,
              MacCyrillicModel,
              Ibm866Model,
              Ibm855Model,
              Latin7GreekModel,
              Win1253GreekModel,
              Latin5BulgarianModel,
              Win1251BulgarianModel,
              TIS620ThaiModel,
              Latin5TurkishModel,
          )
          # One prober per model, kept in the order listed above.
          self.probers = [
              SingleByteCharSetProber(model) for model in language_models
          ]

          # Hebrew uses a coordinating HebrewProber plus two probers over
          # the same Win1255 model. They differ only in the second
          # positional argument (False for "logical", True for "visual");
          # presumably a reversed-text flag -- confirm against
          # SingleByteCharSetProber.
          name_prober = HebrewProber()
          logical = SingleByteCharSetProber(
              Win1255HebrewModel, False, name_prober
          )
          visual = SingleByteCharSetProber(
              Win1255HebrewModel, True, name_prober
          )
          name_prober.set_model_probers(logical, visual)
          self.probers.extend((name_prober, logical, visual))

          self.reset()