-
Notifications
You must be signed in to change notification settings - Fork 2
/
object-vasconcellos.py
executable file
·101 lines (73 loc) · 4.14 KB
/
object-vasconcellos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# encoding: utf-8
import csv
import os
import sys
import SeSG
from transformers import BertTokenizer, BertForMaskedLM
def main():
    """Run the SeSG search-string generation experiment for the 'vasconcellos' review.

    For every combination of min_df, number of LDA topics, number of words
    per topic, and BERT enrichment level, this builds a Scopus search string,
    runs the search, scores the results against the quasi-gold standard (QGS)
    and gold standard (GS, with and without snowballing), and appends one row
    per combination to ``exits/<author>-result.csv``.

    Side effects: network access (BERT model download, Scopus queries) and
    file output under ``exits/``. Returns nothing.
    """
    # Fixed experiment settings.
    levenshtein_distance = 4  # max edit distance used during string formulation
    lda_iterations = 5000     # LDA training iterations

    # Parameter grids swept by the experiment.
    min_df_list = [0.1, 0.2, 0.3, 0.4]
    number_topics_list = [1, 2, 3, 4, 5]
    number_words_list = [5, 6, 7, 8, 9, 10]
    enrichment_list = [0, 1, 2, 3]

    author = 'vasconcellos'
    pub_year_one = 2015  # 0 = disable pub_year
    pub_year_two = 0     # 0 = disable pub_year
    qgs_size = 10
    gs_size = 30

    qgs_txt = 'files-qgs/%s-review/qgs-txt/metadata' % author

    # Running CERMINE (change the path to the .jar file and to the input folder).
    # All the articles in .pdf format were hidden due to their publication restrictions.
    # print("Loading CERMINE...\n")
    # cermine = "java -cp cermine-impl-1.14-20180204.213009-17-jar-with-dependencies.jar " \
    #           "pl.edu.icm.cermine.ContentExtractor -path " \
    #           "/home/fuchs/Documentos/SeSG/files-qgs/%s-review/gs-pdf/ -outputs text" % author
    # os.system(cermine)

    print("Randomize QGS...\n")
    SeSG.randomize_qgs(qgs_size, gs_size, author)

    print("Doing Snowballing...\n")
    title_list, adjacency_matrix, final_edges = SeSG.snowballing(author)

    print("Loading BERT...\n")
    # Load pre-trained model tokenizer (vocabulary) and weights.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    bert_model.eval()  # inference mode only; no training happens here

    # newline='' is required by the csv module so rows are not interleaved
    # with blank lines on Windows. The with-statement closes the file; no
    # explicit close() is needed.
    with open(os.path.join(sys.path[0], 'exits/%s-result.csv' % author),
              mode='w', newline='') as file_output:
        file_writer = csv.writer(file_output, delimiter=',')
        file_writer.writerow(['min_df', 'Topics', 'Words', 'Similar Words', 'No. Results',
                              'No. QGS', 'No. GS', 'No. Total'])

        for min_df in min_df_list:
            for number_topics in number_topics_list:
                for number_words in number_words_list:
                    print("Test with " + str(number_topics) + " topics and " + str(number_words) + " words in " + str(
                        min_df) + " min_df:")
                    print("\n")

                    # Topic model is built once per (min_df, topics, words)
                    # combination and reused for every enrichment level.
                    dic, tf = SeSG.bag_of_words(min_df, qgs_txt)
                    lda = SeSG.lda_algorithm(tf, lda_iterations, number_topics)

                    for enrichment in enrichment_list:
                        string = SeSG.string_formulation(lda, dic, number_words, number_topics, enrichment,
                                                         levenshtein_distance, pub_year_one, pub_year_two,
                                                         bert_model, bert_tokenizer, author)
                        scopus_number_results = SeSG.scopus_search(string)

                        qgs, gs, result_name_list, manual_comparation = SeSG.open_necessary_files(author)
                        counter_one = SeSG.similarity_score_qgs(qgs, result_name_list, manual_comparation, author)
                        counter_two, list_graph = SeSG.similarity_score_gs(gs, result_name_list, manual_comparation,
                                                                           author)
                        counter_total = SeSG.graph(list_graph, title_list, adjacency_matrix, final_edges,
                                                   min_df, number_topics, number_words, enrichment)

                        file_writer.writerow(
                            [min_df, number_topics, number_words, enrichment, scopus_number_results, counter_one,
                             counter_two, counter_total])

                        print("String with " + str(enrichment) + " similar words: " + str(string))
                        print("Generating " + str(scopus_number_results) + " results with " +
                              str(counter_one) + " of the QGS articles, " + str(counter_two) +
                              " of the GS articles (without snowballing) and " + str(counter_total) +
                              " of the GS articles (with snowballing).")
                        print("\n")
# Script entry point: run the experiment only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()