-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathReservoirSampling.py
53 lines (45 loc) · 1.41 KB
/
ReservoirSampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import random
import pandas as pd
import csv
import InputOutput as io
def removePreviousSelected(path=r'D:\Python\NLP\FatAcceptance\Training\Final\Labeled.csv'):
    """Return the integer values found in the first column of a CSV file.

    The first row is treated as a header and skipped. Blank rows are
    ignored.

    Args:
        path: CSV file to read. Defaults to the labeled-data file this
            script has always read, so zero-argument calls keep working.

    Returns:
        A list of ints, one per non-blank data row, in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a first-column value cannot be parsed by ``int``.
    """
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields containing embedded newlines are parsed correctly.
    with open(path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; tolerate an empty file
        return [int(row[0]) for row in reader if row]
def selectKItems(stream, k, n):
    """Pick up to ``k`` items uniformly at random from the first ``n`` of ``stream``.

    Uses reservoir sampling: the first ``k`` items fill the reservoir, then
    each later item at position ``i`` replaces a random reservoir slot with
    probability ``k / (i + 1)``.

    Args:
        stream: Indexable sequence to sample from.
        k: Desired sample size.
        n: Number of leading items of ``stream`` to consider.

    Returns:
        A new list of ``min(k, n)`` items (empty if either is <= 0). Each
        position in ``stream`` is used at most once, so the sample contains
        no repeats unless ``stream`` itself does.
    """
    # Never read past the n items the caller told us about.
    size = max(0, min(k, n))
    reservoir = [stream[i] for i in range(size)]

    # Start at the first item that is NOT already in the reservoir.
    # Reusing the fill loop's final index (k - 1) here would process that
    # item twice and overwrite a random slot with a duplicate of it.
    for i in range(size, n):
        j = random.randrange(i + 1)
        if j < k:
            reservoir[j] = stream[i]
    return reservoir
def main():
    """Sample 26 rows that are not yet labeled and write them to a new CSV.

    Loads the source CSV with pandas, excludes every row whose ``num``
    value is returned by ``removePreviousSelected``, draws 26 of the
    remaining row positions with ``selectKItems``, and passes the chosen
    rows (each with one extra empty string appended) to ``io.csvOut``.
    """
    csvIn = pd.read_csv(r'D:\Python\NLP\FatAcceptance\Overall\WithoutRetweets.csv')
    # Maps each row's index label to a {column: value} dict.
    rows = csvIn.to_dict('index')

    # Convert once to a set so the per-row membership test below is O(1)
    # instead of a linear scan of the whole list for every row.
    previous = set(removePreviousSelected())

    # Positions of rows whose 'num' has not been selected before.
    indices = [i for i in range(len(rows)) if rows[i]['num'] not in previous]
    selected_indices = selectKItems(indices, 26, len(indices))

    # Each output row is the source row's values in column order followed
    # by an empty string (presumably the blank 'label' cell; confirm
    # against the source CSV's columns).
    selected = [list(rows[idx].values()) + [''] for idx in selected_indices]

    io.csvOut(r'Training\Short3.csv', cols=['num', 'id', 'date', 'text', 'label'], data=selected)
# Run the sampling script only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()