348 lines
13 KiB
Python
348 lines
13 KiB
Python
|
# Natural Language Toolkit: Collocations Application
|
||
|
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
|
||
|
# Copyright (C) 2001-2018 NLTK Project
|
||
|
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
#
|
||
|
|
||
|
|
||
|
from __future__ import division
|
||
|
|
||
|
import threading
|
||
|
|
||
|
from six.moves import queue as q
|
||
|
from six.moves.tkinter_font import Font
|
||
|
from six.moves.tkinter import (Button, END, Frame, IntVar, LEFT, Label, Menu,
|
||
|
OptionMenu, SUNKEN, Scrollbar, StringVar,
|
||
|
Text, Tk)
|
||
|
|
||
|
from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino,
|
||
|
indian, floresta, mac_morpho, machado, cess_esp)
|
||
|
from nltk.util import in_idle
|
||
|
from nltk.probability import FreqDist
|
||
|
|
||
|
|
||
|
CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
|
||
|
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
|
||
|
POLL_INTERVAL = 100
|
||
|
|
||
|
_DEFAULT = 'English: Brown Corpus (Humor)'
|
||
|
_CORPORA = {
|
||
|
'Catalan: CESS-CAT Corpus':
|
||
|
lambda: cess_cat.words(),
|
||
|
'English: Brown Corpus':
|
||
|
lambda: brown.words(),
|
||
|
'English: Brown Corpus (Press)':
|
||
|
lambda: brown.words(categories=['news', 'editorial', 'reviews']),
|
||
|
'English: Brown Corpus (Religion)':
|
||
|
lambda: brown.words(categories='religion'),
|
||
|
'English: Brown Corpus (Learned)':
|
||
|
lambda: brown.words(categories='learned'),
|
||
|
'English: Brown Corpus (Science Fiction)':
|
||
|
lambda: brown.words(categories='science_fiction'),
|
||
|
'English: Brown Corpus (Romance)':
|
||
|
lambda: brown.words(categories='romance'),
|
||
|
'English: Brown Corpus (Humor)':
|
||
|
lambda: brown.words(categories='humor'),
|
||
|
'English: NPS Chat Corpus':
|
||
|
lambda: nps_chat.words(),
|
||
|
'English: Wall Street Journal Corpus':
|
||
|
lambda: treebank.words(),
|
||
|
'Chinese: Sinica Corpus':
|
||
|
lambda: sinica_treebank.words(),
|
||
|
'Dutch: Alpino Corpus':
|
||
|
lambda: alpino.words(),
|
||
|
'Hindi: Indian Languages Corpus':
|
||
|
lambda: indian.words(files='hindi.pos'),
|
||
|
'Portuguese: Floresta Corpus (Portugal)':
|
||
|
lambda: floresta.words(),
|
||
|
'Portuguese: MAC-MORPHO Corpus (Brazil)':
|
||
|
lambda: mac_morpho.words(),
|
||
|
'Portuguese: Machado Corpus (Brazil)':
|
||
|
lambda: machado.words(),
|
||
|
'Spanish: CESS-ESP Corpus':
|
||
|
lambda: cess_esp.words()
|
||
|
}
|
||
|
|
||
|
class CollocationsView:
|
||
|
_BACKGROUND_COLOUR='#FFF' #white
|
||
|
|
||
|
def __init__(self):
|
||
|
self.queue = q.Queue()
|
||
|
self.model = CollocationsModel(self.queue)
|
||
|
self.top = Tk()
|
||
|
self._init_top(self.top)
|
||
|
self._init_menubar()
|
||
|
self._init_widgets(self.top)
|
||
|
self.load_corpus(self.model.DEFAULT_CORPUS)
|
||
|
self.after = self.top.after(POLL_INTERVAL, self._poll)
|
||
|
|
||
|
def _init_top(self, top):
|
||
|
top.geometry('550x650+50+50')
|
||
|
top.title('NLTK Collocations List')
|
||
|
top.bind('<Control-q>', self.destroy)
|
||
|
top.protocol('WM_DELETE_WINDOW', self.destroy)
|
||
|
top.minsize(550,650)
|
||
|
|
||
|
def _init_widgets(self, parent):
|
||
|
self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1))
|
||
|
self._init_corpus_select(self.main_frame)
|
||
|
self._init_results_box(self.main_frame)
|
||
|
self._init_paging(self.main_frame)
|
||
|
self._init_status(self.main_frame)
|
||
|
self.main_frame.pack(fill='both', expand=True)
|
||
|
|
||
|
def _init_corpus_select(self, parent):
|
||
|
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
|
||
|
self.var = StringVar(innerframe)
|
||
|
self.var.set(self.model.DEFAULT_CORPUS)
|
||
|
Label(innerframe, justify=LEFT, text=' Corpus: ', background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left')
|
||
|
|
||
|
other_corpora = list(self.model.CORPORA.keys()).remove(self.model.DEFAULT_CORPUS)
|
||
|
om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, *self.model.non_default_corpora())
|
||
|
om['borderwidth'] = 0
|
||
|
om['highlightthickness'] = 1
|
||
|
om.pack(side='left')
|
||
|
innerframe.pack(side='top', fill='x', anchor='n')
|
||
|
|
||
|
def _init_status(self, parent):
|
||
|
self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0)
|
||
|
self.status.pack(side='top', anchor='sw')
|
||
|
|
||
|
def _init_menubar(self):
|
||
|
self._result_size = IntVar(self.top)
|
||
|
menubar = Menu(self.top)
|
||
|
|
||
|
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
|
||
|
filemenu.add_command(label='Exit', underline=1,
|
||
|
command=self.destroy, accelerator='Ctrl-q')
|
||
|
menubar.add_cascade(label='File', underline=0, menu=filemenu)
|
||
|
|
||
|
editmenu = Menu(menubar, tearoff=0)
|
||
|
rescntmenu = Menu(editmenu, tearoff=0)
|
||
|
rescntmenu.add_radiobutton(label='20', variable=self._result_size,
|
||
|
underline=0, value=20, command=self.set_result_size)
|
||
|
rescntmenu.add_radiobutton(label='50', variable=self._result_size,
|
||
|
underline=0, value=50, command=self.set_result_size)
|
||
|
rescntmenu.add_radiobutton(label='100', variable=self._result_size,
|
||
|
underline=0, value=100, command=self.set_result_size)
|
||
|
rescntmenu.invoke(1)
|
||
|
editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
|
||
|
|
||
|
menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
|
||
|
self.top.config(menu=menubar)
|
||
|
|
||
|
def set_result_size(self, **kwargs):
|
||
|
self.model.result_count = self._result_size.get()
|
||
|
|
||
|
def _init_results_box(self, parent):
|
||
|
innerframe = Frame(parent)
|
||
|
i1 = Frame(innerframe)
|
||
|
i2 = Frame(innerframe)
|
||
|
vscrollbar = Scrollbar(i1, borderwidth=1)
|
||
|
hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
|
||
|
self.results_box = Text(i1,
|
||
|
font=Font(family='courier', size='16'),
|
||
|
state='disabled', borderwidth=1,
|
||
|
yscrollcommand=vscrollbar.set,
|
||
|
xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
|
||
|
self.results_box.pack(side='left', fill='both', expand=True)
|
||
|
vscrollbar.pack(side='left', fill='y', anchor='e')
|
||
|
vscrollbar.config(command=self.results_box.yview)
|
||
|
hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
|
||
|
hscrollbar.config(command=self.results_box.xview)
|
||
|
#there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
|
||
|
Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
|
||
|
i1.pack(side='top', fill='both', expand=True, anchor='n')
|
||
|
i2.pack(side='bottom', fill='x', anchor='s')
|
||
|
innerframe.pack(side='top', fill='both', expand=True)
|
||
|
|
||
|
def _init_paging(self, parent):
|
||
|
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
|
||
|
self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled')
|
||
|
prev.pack(side='left', anchor='center')
|
||
|
self.next = next = Button(innerframe, text='Next', command=self.__next__, width='10', borderwidth=1, highlightthickness=1, state='disabled')
|
||
|
next.pack(side='right', anchor='center')
|
||
|
innerframe.pack(side='top', fill='y')
|
||
|
self.reset_current_page()
|
||
|
|
||
|
def reset_current_page(self):
|
||
|
self.current_page = -1
|
||
|
|
||
|
def _poll(self):
|
||
|
try:
|
||
|
event = self.queue.get(block=False)
|
||
|
except q.Empty:
|
||
|
pass
|
||
|
else:
|
||
|
if event == CORPUS_LOADED_EVENT:
|
||
|
self.handle_corpus_loaded(event)
|
||
|
elif event == ERROR_LOADING_CORPUS_EVENT:
|
||
|
self.handle_error_loading_corpus(event)
|
||
|
self.after = self.top.after(POLL_INTERVAL, self._poll)
|
||
|
|
||
|
def handle_error_loading_corpus(self, event):
|
||
|
self.status['text'] = 'Error in loading ' + self.var.get()
|
||
|
self.unfreeze_editable()
|
||
|
self.clear_results_box()
|
||
|
self.freeze_editable()
|
||
|
self.reset_current_page()
|
||
|
|
||
|
def handle_corpus_loaded(self, event):
|
||
|
self.status['text'] = self.var.get() + ' is loaded'
|
||
|
self.unfreeze_editable()
|
||
|
self.clear_results_box()
|
||
|
self.reset_current_page()
|
||
|
#self.next()
|
||
|
collocations = self.model.next(self.current_page + 1)
|
||
|
self.write_results(collocations)
|
||
|
self.current_page += 1
|
||
|
|
||
|
def corpus_selected(self, *args):
|
||
|
new_selection = self.var.get()
|
||
|
self.load_corpus(new_selection)
|
||
|
|
||
|
def previous(self):
|
||
|
self.freeze_editable()
|
||
|
collocations = self.model.prev(self.current_page - 1)
|
||
|
self.current_page= self.current_page - 1
|
||
|
self.clear_results_box()
|
||
|
self.write_results(collocations)
|
||
|
self.unfreeze_editable()
|
||
|
|
||
|
def __next__(self):
|
||
|
self.freeze_editable()
|
||
|
collocations = self.model.next(self.current_page + 1)
|
||
|
self.clear_results_box()
|
||
|
self.write_results(collocations)
|
||
|
self.current_page += 1
|
||
|
self.unfreeze_editable()
|
||
|
|
||
|
def load_corpus(self, selection):
|
||
|
if self.model.selected_corpus != selection:
|
||
|
self.status['text'] = 'Loading ' + selection + '...'
|
||
|
self.freeze_editable()
|
||
|
self.model.load_corpus(selection)
|
||
|
|
||
|
def freeze_editable(self):
|
||
|
self.prev['state'] = 'disabled'
|
||
|
self.next['state'] = 'disabled'
|
||
|
|
||
|
def clear_results_box(self):
|
||
|
self.results_box['state'] = 'normal'
|
||
|
self.results_box.delete("1.0", END)
|
||
|
self.results_box['state'] = 'disabled'
|
||
|
|
||
|
def fire_event(self, event):
|
||
|
#Firing an event so that rendering of widgets happen in the mainloop thread
|
||
|
self.top.event_generate(event, when='tail')
|
||
|
|
||
|
def destroy(self, *e):
|
||
|
if self.top is None: return
|
||
|
self.top.after_cancel(self.after)
|
||
|
self.top.destroy()
|
||
|
self.top = None
|
||
|
|
||
|
def mainloop(self, *args, **kwargs):
|
||
|
if in_idle(): return
|
||
|
self.top.mainloop(*args, **kwargs)
|
||
|
|
||
|
def unfreeze_editable(self):
|
||
|
self.set_paging_button_states()
|
||
|
|
||
|
def set_paging_button_states(self):
|
||
|
if self.current_page == -1 or self.current_page == 0:
|
||
|
self.prev['state'] = 'disabled'
|
||
|
else:
|
||
|
self.prev['state'] = 'normal'
|
||
|
if self.model.is_last_page(self.current_page):
|
||
|
self.next['state'] = 'disabled'
|
||
|
else:
|
||
|
self.next['state'] = 'normal'
|
||
|
|
||
|
def write_results(self, results):
|
||
|
self.results_box['state'] = 'normal'
|
||
|
row = 1
|
||
|
for each in results:
|
||
|
self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n")
|
||
|
row += 1
|
||
|
self.results_box['state'] = 'disabled'
|
||
|
|
||
|
class CollocationsModel:
|
||
|
def __init__(self, queue):
|
||
|
self.result_count = None
|
||
|
self.selected_corpus = None
|
||
|
self.collocations = None
|
||
|
self.CORPORA = _CORPORA
|
||
|
self.DEFAULT_CORPUS = _DEFAULT
|
||
|
self.queue = queue
|
||
|
self.reset_results()
|
||
|
|
||
|
def reset_results(self):
|
||
|
self.result_pages = []
|
||
|
self.results_returned = 0
|
||
|
|
||
|
def load_corpus(self, name):
|
||
|
self.selected_corpus = name
|
||
|
self.collocations = None
|
||
|
runner_thread = self.LoadCorpus(name, self)
|
||
|
runner_thread.start()
|
||
|
self.reset_results()
|
||
|
|
||
|
def non_default_corpora(self):
|
||
|
copy = []
|
||
|
copy.extend(list(self.CORPORA.keys()))
|
||
|
copy.remove(self.DEFAULT_CORPUS)
|
||
|
copy.sort()
|
||
|
return copy
|
||
|
|
||
|
def is_last_page(self, number):
|
||
|
if number < len(self.result_pages):
|
||
|
return False
|
||
|
return self.results_returned + (number - len(self.result_pages)) * self.result_count >= len(self.collocations)
|
||
|
|
||
|
def next(self, page):
|
||
|
if (len(self.result_pages) - 1) < page:
|
||
|
for i in range(page - (len(self.result_pages) - 1)):
|
||
|
self.result_pages.append(self.collocations[self.results_returned:self.results_returned+self.result_count])
|
||
|
self.results_returned += self.result_count
|
||
|
return self.result_pages[page]
|
||
|
|
||
|
def prev(self, page):
|
||
|
if page == -1:
|
||
|
return []
|
||
|
return self.result_pages[page]
|
||
|
|
||
|
class LoadCorpus(threading.Thread):
|
||
|
def __init__(self, name, model):
|
||
|
threading.Thread.__init__(self)
|
||
|
self.model, self.name = model, name
|
||
|
|
||
|
def run(self):
|
||
|
try:
|
||
|
words = self.model.CORPORA[self.name]()
|
||
|
from operator import itemgetter
|
||
|
text = [w for w in words if len(w) > 2]
|
||
|
fd = FreqDist(tuple(text[i:i+2]) for i in range(len(text)-1))
|
||
|
vocab = FreqDist(text)
|
||
|
scored = [((w1,w2), fd[(w1,w2)] ** 3 / (vocab[w1] * vocab[w2])) for w1, w2 in fd]
|
||
|
scored.sort(key=itemgetter(1), reverse=True)
|
||
|
self.model.collocations = list(map(itemgetter(0), scored))
|
||
|
self.model.queue.put(CORPUS_LOADED_EVENT)
|
||
|
except Exception as e:
|
||
|
print(e)
|
||
|
self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
|
||
|
|
||
|
#def collocations():
|
||
|
# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
|
||
|
|
||
|
def app():
|
||
|
c = CollocationsView()
|
||
|
c.mainloop()
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
app()
|
||
|
|
||
|
__all__ = ['app']
|