-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathinverted-index.py
63 lines (50 loc) · 1.69 KB
/
inverted-index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
'''
Inverted index construction over the NLTK Gutenberg corpus.

Contains:
- brief background on the data used (nltk.corpus.gutenberg)
- helper data structures used for searching
> term frequency table (corpus-wide count for every word)
> inverted index (word -> per-document count)
> maximum term frequency for every document
- inverted index construction
'''
import nltk
from nltk.corpus import gutenberg
# Project Gutenberg has a dataset of 25000 books.
# 18 of those books are available within NLTK as nltk.corpus.gutenberg.
# Names of the books (corpus file ids) exposed by nltk.corpus.gutenberg
file_names = gutenberg.fileids()
# Parenthesised form prints the same list on Python 2 and Python 3.
print(file_names)

# Map from file names to integer file ids.
# Ids are assigned 0, 1, 2, ... in the order returned by fileids().
filename_map = {
    book_name: file_id for file_id, book_name in enumerate(file_names)
}
# Dictionary of term frequencies: word -> number of occurrences summed over
# every book in file_names. Words are used exactly as gutenberg.words()
# returns them; no lowercasing or filtering is applied here.
term_frequency = {}
for book_name in file_names:
    for word in gutenberg.words(book_name):
        # dict.get replaces the removed-in-Python-3 has_key() check and
        # treats a word seen for the first time as having count 0.
        term_frequency[word] = term_frequency.get(word, 0) + 1
# Inverted index containing document-wise frequency:
#   word -> posting list, where a posting list is {book name -> count of the
#   word in that book}. A book appears in a word's posting list only if the
#   word occurs in it at least once.
inverted_index = {}
for book_name in file_names:
    for word in gutenberg.words(book_name):
        # setdefault returns the word's existing posting list, or installs
        # and returns a fresh empty one the first time the word is seen
        # (replaces the has_key() checks, which do not exist on Python 3).
        posting_list = inverted_index.setdefault(word, {})
        posting_list[book_name] = posting_list.get(book_name, 0) + 1
# Maximum term frequency for every document: book name -> the largest
# per-book count found for any single word in inverted_index.
# Every book starts at 0 so books with no postings still get an entry.
max_frequency = dict.fromkeys(file_names, 0)
for postings in inverted_index.values():
    for doc_name, count in postings.items():
        # Keep the running maximum for this book.
        if count > max_frequency[doc_name]:
            max_frequency[doc_name] = count