def _build_vocab(filename):
    """Build a word-to-integer-id vocabulary from the tokens in a file.

    Tokens are ranked by descending frequency; tokens with equal frequency
    are ordered ascending by the token itself, so the resulting ids are
    deterministic. The highest-ranked token gets id 0, the next id 1, and
    so on.

    Args:
        filename: Passed through unchanged to ``_read_words``.
            NOTE(review): presumably a path to a text corpus, and
            ``_read_words`` presumably returns an iterable of hashable,
            mutually comparable tokens -- confirm against that helper.

    Returns:
        A dict mapping each distinct token to its rank-based integer id.
        An empty dict is returned when ``_read_words`` yields no tokens.
    """
    data = _read_words(filename)

    counter = collections.Counter(data)
    # Negating the count sorts by frequency descending while keeping the
    # tie-break on the token ascending.
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

    # Enumerating the sorted pairs directly (instead of transposing with
    # zip(*...) and unpacking) avoids a ValueError on an empty corpus.
    return {word: index for index, (word, _) in enumerate(count_pairs)}
Figure 1: Flow chart of the _build_vocab.py file