Example Configured Files: main.py and expander_factory.py - For Review #12

kpoots opened this issue Oct 3, 2020 · 0 comments

kpoots commented Oct 3, 2020

My initial configured files, working with ReQue. Cut/paste as you wish.

######################
#../ReQue/qe/main.py

#Oct 2: customized for ReQue trial1 - KP

#example command for running this module (the python3 ... --metric ... command below goes on a single line):
#for query expansion:

#python3 -u main.py --anserini ../anserini/ --corpus robust04
#--index ../ds/robust04/lucene-index.robust04.pos+docvectors+rawdocs
#--output ../ds/qe/robust04/ --ranker bm25
#--metric map 2>&1 | tee robust04.log &

#see cmn/expander_factory.py for example of expanders, stemmers configuration

#Basic ReQue installation steps (for details see git repository)

#1. Make a directory and clone the ReQue git repository:
#git clone https://github.com/hosseinfani/ReQue.git
#2. Set up anserini:
#cd ReQue; rm -rf anserini; git clone https://github.com/castorini/anserini.git --recurse-submodules
#3. Build anserini:
#cd anserini; mvn clean package appassembler:assemble
#cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
#cd tools/eval/ndeval && make && cd ../../..
#4. Before using this application, download/unpack the datasets (queries and qrels)
#plus the word embeddings (Word2Vec-format FastText and GloVe)
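
#Optional sanity check before running: an illustrative sketch (the paths below are
#just the defaults used in the example command above and in cmn/expander_factory.py;
#adjust them to your setup):
#
#  from os import path
#  for p in ['../anserini/target/appassembler/bin/SearchCollection',
#            '../anserini/tools/eval/trec_eval.9.0.4/trec_eval',
#            '../ds/robust04/lucene-index.robust04.pos+docvectors+rawdocs',
#            '../ds/robust04/topics.robust04.txt',
#            '../ds/robust04/qrels.robust04.txt',
#            '../pre/wiki-news-300d-1M.vec']:
#      print(p, 'OK' if path.exists(p) else 'MISSING')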

#TODO: list all library requirements such as stemmers, tagme, ...
import os, traceback, operator, sys, math
from os import path
import pandas as pd
import argparse

#build anserini (maven) for doing A) indexing, B) information retrieval, and C) evaluation
#A) INDEX DOCUMENTS
#robust04
#$> query_expansion/anserini/target/appassembler/bin/IndexCollection -collection TrecCollection -input Robust04-Corpus -index lucene-index.robust04.pos+docvectors+rawdocs -generator JsoupGenerator -threads 44 -storePositions -storeDocvectors -storeRawDocs 2>&1 | tee log.robust04.pos+docvectors+rawdocs &
#Already done in https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz

#Gov2:
#$> query_expansion/anserini/target/appassembler/bin/IndexCollection -collection TrecwebCollection -input Gov2-Corpus -index lucene-index.gov2.pos+docvectors+rawdocs -generator JsoupGenerator -threads 44 -storePositions -storeDocvectors -storeRawDocs 2>&1 | tee log.gov2.pos+docvectors+rawdocs &

#ClueWeb09-B-Corpus:
#$> query_expansion/anserini/target/appassembler/bin/IndexCollection -collection ClueWeb09Collection -input ClueWeb09-B-Corpus -index lucene-index.cw09b.pos+docvectors+rawdocs -generator JsoupGenerator -threads 44 -storePositions -storeDocvectors -storeRawDocs 2>&1 | tee log.cw09b.pos+docvectors+rawdocs &

#ClueWeb12-B-Corpus:
#$> query_expansion/anserini/target/appassembler/bin/IndexCollection -collection ClueWeb12Collection -input ClueWeb12-B-Corpus -index lucene-index.cw12b13.pos+docvectors+rawdocs -generator JsoupGenerator -threads 44 -storePositions -storeDocvectors -storeRawDocs 2>&1 | tee log.cw12b13.pos+docvectors+rawdocs &

#B) INFORMATION RETRIEVAL: Ranking & Reranking
#$> query_expansion/anserini/target/appassembler/bin/SearchCollection -bm25 -threads 44 -topicreader Trec -index query_expansion/ds/robust04/index-robust04-20191213 -topics query_expansion/ds/robust04/topics.robust04.txt -output query_expansion/ds/robust04/topics.robust04.bm25.txt

#C) EVAL
#$> eval/trec_eval.9.0.4/trec_eval -q -m map query_expansion/ds/robust04/qrels.robust04.txt query_expansion/ds/robust04/topics.robust04.bm25.map.txt

#q: query
#Q: set of queries
#q_: expanded query (q')
#Q_: set of expanded queries (Q')

from cmn import expander_factory as ef
from expanders.abstractqexpander import AbstractQExpander

def generate(Qfilename, expanders, output):
    df = pd.DataFrame()
    model_errs = dict()
    for model in expanders:
        model_name = model.get_model_name()
        try:
            Q_filename = '{}.{}.txt'.format(output, model_name)
            # if not os.path.isfile(Q_filename) or overwrite:
            model.write_expanded_queries(Qfilename, Q_filename)
        except:
            model_errs[model_name] = traceback.format_exc()
            continue
    for model_err, msg in model_errs.items():
        print('INFO: MAIN: GENERATE: There has been an error in {}!\n{}'.format(model_err, msg))
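
#For reference, generate() only relies on two methods of the expander objects:
#get_model_name() and write_expanded_queries() (aggregate() below additionally calls
#read_expanded_queries()). A hypothetical, minimal expander satisfying that contract
#could look like the sketch below; it is illustrative only, since the real expanders
#live under ../expanders and are wired up in cmn/expander_factory.py:
#
#  class IdentityExpander:
#      def get_model_name(self):
#          return 'identityexpander'
#      def write_expanded_queries(self, Qfilename, Q_filename):
#          # "expands" every query by leaving the topic file unchanged
#          with open(Qfilename) as fin, open(Q_filename, 'w') as fout:
#              fout.write(fin.read())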

def search(expanders, rankers, topicreader, index, anserini, output):
    # Information Retrieval using Anserini
    rank_cmd = '{}target/appassembler/bin/SearchCollection'.format(anserini)
    model_errs = dict()

    for model in expanders:
        model_name = model.get_model_name()
        try:
            Q_filename = '{}.{}.txt'.format(output, model_name)
            for ranker in rankers:
                Q_pred = '{}.{}.{}.txt'.format(output, model_name, ef.get_ranker_name(ranker))
                cli_cmd = '\"{}\" {} -threads 44 -topicreader {} -index {} -topics {} -output {}'.format(rank_cmd, ranker, topicreader, index, Q_filename, Q_pred)
                print('{}\n'.format(cli_cmd))
                stream = os.popen(cli_cmd)
                print(stream.read())
        except:
            model_errs[model_name] = traceback.format_exc()
            continue
    for model_err, msg in model_errs.items():
        print('INFO: MAIN: SEARCH: There has been an error in {}!\n{}'.format(model_err, msg))
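
#For illustration, with --ranker bm25 (ranker '-bm25', get_ranker_name -> 'bm25')
#the loop above issues commands of this shape, where the angle-bracket parts are
#placeholders filled in from the arguments:
#
#  "<anserini>target/appassembler/bin/SearchCollection" -bm25 -threads 44
#      -topicreader Trec -index <index> -topics <output>.<model>.txt
#      -output <output>.<model>.bm25.txt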

def evaluate(expanders, Qrels, rankers, metrics, anserini, output):
    # Evaluation using trec_eval
    # Oct 2: path to trec_eval has been updated - KP
    eval_cmd = '{}tools/eval/trec_eval.9.0.4/trec_eval'.format(anserini)
    model_errs = dict()

    for model in expanders:
        model_name = model.get_model_name()
        try:
            for ranker in rankers:
                Q_pred = '{}.{}.{}.txt'.format(output, model_name, ef.get_ranker_name(ranker))
                for metric in metrics:
                    Q_eval = '{}.{}.{}.{}.txt'.format(output, model_name, ef.get_ranker_name(ranker), metric)
                    cli_cmd = '\"{}\" -q -m {} {} {} > {}'.format(eval_cmd, metric, Qrels, Q_pred, Q_eval)
                    print('{}\n'.format(cli_cmd))
                    stream = os.popen(cli_cmd)
                    print(stream.read())
        except:
            model_errs[model_name] = traceback.format_exc()
            continue
    for model_err, msg in model_errs.items():
        print('INFO: MAIN: EVALUATE: There has been an error in {}!\n{}'.format(model_err, msg))
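
#For orientation, the per-query file written by trec_eval (and parsed by aggregate()
#below with sep='\t' and usecols=[1,2]) looks roughly like this (columns are
#tab-separated; the qids and values here are made up):
#
#  map   301   0.2134
#  map   302   0.1876
#  ...
#  map   all   0.2005   <- summary row over all queries, dropped via [:-1]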

def aggregate(expanders, rankers, metrics, output):
    df = pd.DataFrame()
    model_errs = dict()
    queryids = pd.DataFrame()
    for model in expanders:
        model_name = model.get_model_name()
        # try:
        Q_filename = '{}.{}.txt'.format(output, model_name)
        Q_ = model.read_expanded_queries(Q_filename)
        for ranker in rankers:
            for metric in metrics:
                Q_eval = '{}.{}.{}.{}.txt'.format(output, model_name, ef.get_ranker_name(ranker), metric)
                # the last row is the average over all queries; skipped by [:-1]
                values = pd.read_csv(Q_eval, usecols=[1, 2], names=['qid', 'value'], header=None, sep='\t')[:-1]
                values.set_index('qid', inplace=True, verify_integrity=True)

                for idx, r in Q_.iterrows():
                    Q_.loc[idx, '{}.{}.{}'.format(model_name, ef.get_ranker_name(ranker), metric)] = values.loc[str(r.qid), 'value'] if str(r.qid) in values.index else None

        # except:
        #     model_errs[model_name] = traceback.format_exc()
        #     continue
        df = pd.concat([df, Q_], axis=1)

    filename = '{}.{}.{}.all.csv'.format(output, '.'.join([ef.get_ranker_name(r) for r in rankers]), '.'.join(metrics))
    df.to_csv(filename, index=False)
    # for model_err, msg in model_errs.items():
    #     print('INFO: MAIN: AGGREGATE: There has been an error in {}!\n{}'.format(model_err, msg))
    return filename
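
#The resulting .all.csv therefore holds, for every expander, its expanded-query
#column (from read_expanded_queries) plus one score column per ranker/metric pair,
#named '{model}.{ranker}.{metric}'. With --ranker bm25 --metric map the header is,
#schematically (model names here are placeholders):
#
#  qid, <baseline>, <baseline>.bm25.map, <model1>, <model1>.bm25.map, ...
#
#build() below reads these columns back to pick the winning expanders per query.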

def build(input, expanders, rankers, metrics, output):
    base_model_name = AbstractQExpander().get_model_name()
    df = pd.read_csv(input)
    ds_df = df.iloc[:, :1 + 1 + len(rankers) * len(metrics)]  # the original query info
    ds_df['star_model_count'] = 0
    for idx, row in df.iterrows():
        star_models = dict()
        for model in expanders:
            model_name = model.get_model_name()
            if model_name == base_model_name:
                continue
            flag = True
            sum = 0
            for ranker in rankers:
                for metric in metrics:
                    v = df.loc[idx, '{}.{}.{}'.format(model_name, ef.get_ranker_name(ranker), metric)]
                    v = v if not pd.isna(v) else 0
                    v0 = df.loc[idx, '{}.{}.{}'.format(base_model_name, ef.get_ranker_name(ranker), metric)]
                    v0 = v0 if not pd.isna(v0) else 0
                    if v <= v0:
                        flag = False
                        break
                    sum += v ** 2
            if flag:
                star_models[model] = sum

        if len(star_models) > 0:
            ds_df.loc[idx, 'star_model_count'] = len(star_models.keys())
            star_models_sorted = {k: v for k, v in sorted(star_models.items(), key=lambda item: item[1], reverse=True)}
            for i, star_model in enumerate(star_models_sorted.keys()):
                ds_df.loc[idx, '{}.{}'.format('method', i + 1)] = star_model.get_model_name()
                ds_df.loc[idx, '{}.{}'.format('metric', i + 1)] = math.sqrt(star_models[star_model])
                ds_df.loc[idx, '{}.{}'.format('query', i + 1)] = df.loc[idx, '{}'.format(star_model.get_model_name())]
        else:
            ds_df.loc[idx, 'star_model_count'] = 0
    filename = '{}.{}.{}.dataset.csv'.format(output, '.'.join([ef.get_ranker_name(r) for r in rankers]), '.'.join(metrics))
    ds_df.to_csv(filename, index=False)
    return filename
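
#A small worked example of the star-model selection above, with made-up values,
#one ranker (bm25) and one metric (map): if for some query the baseline row has
#abstractqueryexpansion.bm25.map = 0.20 (v0) and an expander scores
#<model>.bm25.map = 0.25 (v), then v > v0 on every ranker/metric pair, so it is
#kept as a "star" model with score 0.25**2 = 0.0625 and reported as
#metric.i = sqrt(0.0625) = 0.25; an expander scoring 0.15 <= 0.20 is dropped.
#Star models are then sorted by that score and written out as the
#method.i / metric.i / query.i columns in decreasing order.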

def run(db, rankers, metrics, anserini, index, output, rf=True, op=[]):

    if db == 'robust04':
        output = '{}topics.robust04'.format(output)
        # index = '/data/anserini/lucene-index.robust04.pos+docvectors+rawdocs'

        expanders = ef.get_nrf_expanders()
        if rf:  # local analysis
            expanders += ef.get_rf_expanders(rankers=rankers, index=index, anserini=anserini, output=output)

        if 'generate' in op: generate(Qfilename='../ds/robust04/topics.robust04.txt', expanders=expanders, output=output)
        if 'search' in op: search(expanders=expanders, rankers=rankers, topicreader='Trec', index=index, anserini=anserini, output=output)
        if 'evaluate' in op: evaluate(expanders=expanders, Qrels='../ds/robust04/qrels.robust04.txt', rankers=rankers, metrics=metrics, anserini=anserini, output=output)
        if 'build' in op:
            result = aggregate(expanders=expanders, rankers=rankers, metrics=metrics, output=output)
            build(input=result, expanders=expanders, rankers=rankers, metrics=metrics, output=output)

    if db == 'gov2':
        # index = '/data/anserini/lucene-index.gov2.pos+docvectors+rawdocs'
        topicreader = 'Trec'

        results = []
        for r in ['4.701-750', '5.751-800', '6.801-850']:
            # use a per-round output prefix so the prefix does not compound across iterations
            output_ = '{}topics.terabyte0{}'.format(output, r)

            expanders = ef.get_nrf_expanders()
            if rf:
                expanders += ef.get_rf_expanders(rankers=rankers, index=index, anserini=anserini, output=output_)

            if 'generate' in op: generate(Qfilename='../ds/gov2/topics.terabyte0{}.txt'.format(r), expanders=expanders, output=output_)
            if 'search' in op: search(expanders=expanders, rankers=rankers, topicreader=topicreader, index=index, anserini=anserini, output=output_)
            if 'evaluate' in op: evaluate(expanders=expanders, Qrels='../ds/gov2/qrels.terabyte0{}.txt'.format(r), rankers=rankers, metrics=metrics, anserini=anserini, output=output_)
            if 'build' in op:
                result = aggregate(expanders=expanders, rankers=rankers, metrics=metrics, output=output_)
                result = build(input=result, expanders=expanders, rankers=rankers, metrics=metrics, output=output_)
                results.append(result)

        if 'build' in op:
            output = results[0].replace(results[0].split('/')[-1].split('.')[1], 'gov2').replace(results[0].split('/')[-1].split('.')[2], '701-850')
            df = pd.DataFrame()
            for r in results:
                df = pd.concat([df, pd.read_csv(r)], axis=0, ignore_index=True, sort=False)
            df.to_csv(output, index=False)

    if db == 'clueweb09b':
        # index = '/data/anserini/lucene-index.cw09b.pos+docvectors+rawdocs'
        topicreader = 'Webxml'

        results = []
        for r in ['1-50', '51-100', '101-150', '151-200']:
            # per-round output prefix (avoids compounding across iterations)
            output_ = '{}topics.web.{}'.format(output, r)

            expanders = ef.get_nrf_expanders()
            if rf:
                expanders += ef.get_rf_expanders(rankers=rankers, index=index, anserini=anserini, output=output_)

            if 'generate' in op: generate(Qfilename='../ds/clueweb09b/topics.web.{}.txt'.format(r), expanders=expanders, output=output_)
            if 'search' in op: search(expanders=expanders, rankers=rankers, topicreader=topicreader, index=index, anserini=anserini, output=output_)
            if 'evaluate' in op: evaluate(expanders=expanders, Qrels='../ds/clueweb09b/qrels.web.{}.txt'.format(r), rankers=rankers, metrics=metrics, anserini=anserini, output=output_)
            if 'build' in op:
                result = aggregate(expanders=expanders, rankers=rankers, metrics=metrics, output=output_)
                result = build(input=result, expanders=expanders, rankers=rankers, metrics=metrics, output=output_)
                results.append(result)

        if 'build' in op:
            output = results[0].replace('.' + results[0].split('/')[-1].split('.')[1] + '.', '.clueweb09b.').replace(results[0].split('/')[-1].split('.')[2], '1-200')
            df = pd.DataFrame()
            for r in results:
                df = pd.concat([df, pd.read_csv(r)], axis=0, ignore_index=True, sort=False)
            df.to_csv(output, index=False)

    if db == 'clueweb12b13':
        # index = '/data/anserini/lucene-index.cw12b13.pos+docvectors+rawdocs'
        topicreader = 'Webxml'
        results = []
        for r in ['201-250', '251-300']:
            # per-round output prefix (avoids compounding across iterations)
            output_ = '{}topics.web.{}'.format(output, r)

            expanders = ef.get_nrf_expanders()
            if rf:
                expanders += ef.get_rf_expanders(rankers=rankers, index=index, anserini=anserini, output=output_)

            if 'generate' in op: generate(Qfilename='../ds/clueweb12b13/topics.web.{}.txt'.format(r), expanders=expanders, output=output_)
            if 'search' in op: search(expanders=expanders, rankers=rankers, topicreader=topicreader, index=index, anserini=anserini, output=output_)
            if 'evaluate' in op: evaluate(expanders=expanders, Qrels='../ds/clueweb12b13/qrels.web.{}.txt'.format(r), rankers=rankers, metrics=metrics, anserini=anserini, output=output_)
            if 'build' in op:
                result = aggregate(expanders=expanders, rankers=rankers, metrics=metrics, output=output_)
                result = build(input=result, expanders=expanders, rankers=rankers, metrics=metrics, output=output_)
                results.append(result)

        if 'build' in op:
            output = results[0].replace('.' + results[0].split('/')[-1].split('.')[1] + '.', '.clueweb12b13.').replace(results[0].split('/')[-1].split('.')[2], '201-300')
            df = pd.DataFrame()
            for r in results:
                df = pd.concat([df, pd.read_csv(r)], axis=0, ignore_index=True, sort=False)
            df.to_csv(output, index=False)

def addargs(parser):
    anserini = parser.add_argument_group('Anserini')
    anserini.add_argument('--anserini', type=str, default='../anserini/', help='The path to the anserini library (default: ../anserini/)')

    corpus = parser.add_argument_group('Corpus')
    corpus.add_argument('--corpus', type=str, choices=['robust04', 'gov2', 'clueweb09b', 'clueweb12b13'], required=True, help='The corpus name; required; (example: robust04)')
    corpus.add_argument('--index', type=str, required=True, help='The corpus index; required; (example: ../ds/robust04/lucene-index.robust04.pos+docvectors+rawdocs)')

    gold = parser.add_argument_group('Gold Standard Dataset')
    gold.add_argument('--output', type=str, required=True, help='The output path for the gold standard dataset; required; (example: ../ds/qe/robust04/)')
    gold.add_argument('--ranker', type=str, choices=['bm25', 'qld'], default='bm25', help='The ranker name (default: bm25)')
    gold.add_argument('--metric', type=str, choices=['map'], default='map', help='The evaluation metric name (default: map)')

#python -u main.py --anserini ../anserini/ --corpus robust04 --index ../ds/robust04/lucene-index.robust04.pos+docvectors+rawdocs --output ../ds/qe/robust04/ --ranker bm25 --metric map 2>&1 | tee robust04.log &
#python -u main.py --anserini ../anserini/ --corpus robust04 --index ../ds/robust04/lucene-index.robust04.pos+docvectors+rawdocs --output ../ds/qe/robust04/ --ranker qld --metric map 2>&1 | tee robust04.log &

#python -u main.py --anserini ../anserini/ --corpus gov2 --index ../ds/gov2/lucene-index.gov2.pos+docvectors+rawdocs --output ../ds/qe/gov2/ --ranker bm25 --metric map 2>&1 | tee gov2.log &
#python -u main.py --anserini ../anserini/ --corpus gov2 --index ../ds/gov2/lucene-index.gov2.pos+docvectors+rawdocs --output ../ds/qe/gov2/ --ranker qld --metric map 2>&1 | tee gov2.log &

#python -u main.py --anserini ../anserini/ --corpus clueweb09b --index ../ds/clueweb09b/lucene-index.cw09b.pos+docvectors+rawdocs --output ../ds/qe/clueweb09b/ --ranker bm25 --metric map 2>&1 | tee clueweb09b.log &
#python -u main.py --anserini ../anserini/ --corpus clueweb09b --index ../ds/clueweb09b/lucene-index.cw09b.pos+docvectors+rawdocs --output ../ds/qe/clueweb09b/ --ranker qld --metric map 2>&1 | tee clueweb09b.log &

#python -u main.py --anserini ../anserini/ --corpus clueweb12b13 --index ../ds/clueweb12b13/lucene-index.cw12b13.pos+docvectors+rawdocs --output ../ds/qe/clueweb12b13/ --ranker bm25 --metric map 2>&1 | tee clueweb12b13.log &
#python -u main.py --anserini ../anserini/ --corpus clueweb12b13 --index ../ds/clueweb12b13/lucene-index.cw12b13.pos+docvectors+rawdocs --output ../ds/qe/clueweb12b13/ --ranker qld --metric map 2>&1 | tee clueweb12b13.log &

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='ReQue (Refining Queries)')
    addargs(parser)
    args = parser.parse_args()

    ## rf: whether to include relevance feedback expanders (local analysis) or not
    ## op: determines the steps in the pipeline, e.g., op=['generate', 'search', 'evaluate', 'build']

    run(db=args.corpus.lower(),
        rankers=['-' + args.ranker.lower()],
        metrics=[args.metric.lower()],
        anserini=args.anserini,
        index=args.index,
        output=args.output,
        rf=True,
        op=['generate', 'search', 'evaluate', 'build'])

######################
#../ReQue/qe/cmn/expander_factory.py

#Oct 2: customized for ReQue trial1 - KP
#several expanders & stemmers have been commented out to work
#with the initial datasets

#disabled expanders/components for trial1:
#Thesaurus, Anchor (both replace=True and replace=False versions),
#Wiki (both replace=True and replace=False versions),
#RelevanceFeedback, Docluster, Termluster, Conceptluster

#disabled stemmers for trial1:
#KrovetzStemmer, Trunc5Stemmer

#in some cases, you will need additional libraries and/or configuration
#to be able to use the disabled expanders or stemmers

import sys
sys.path.extend(['../qe'])

#TODO: ServiceFactory: dynamically load the class files in expanders folder and create an instance object
from expanders.abstractqexpander import AbstractQExpander
from expanders.sensedisambiguation import SenseDisambiguation
from expanders.thesaurus import Thesaurus
from expanders.wordnet import Wordnet
from expanders.word2vec import Word2Vec
from expanders.glove import Glove
from expanders.conceptnet import Conceptnet
from expanders.relevancefeedback import RelevanceFeedback
from expanders.docluster import Docluster
from expanders.termluster import Termluster
from expanders.conceptluster import Conceptluster
from expanders.anchor import Anchor
from expanders.tagmee import Tagmee
from expanders.wiki import Wiki

#TODO: ServiceFactory: dynamically load the class files in stemmers folder and create an instance object
from stemmers.krovetz import KrovetzStemmer
from stemmers.lovins import LovinsStemmer
from stemmers.paicehusk import PaiceHuskStemmer
from stemmers.porter import PorterStemmer
from stemmers.porter2 import Porter2Stemmer
from stemmers.sstemmer import SRemovalStemmer
from stemmers.trunc4 import Trunc4Stemmer
from stemmers.trunc5 import Trunc5Stemmer
from expanders.stem import Stem # the Stem expander is the wrapper that exposes any stemmer as an expander :)

#global analysis
def get_nrf_expanders():
    expanders = [AbstractQExpander(),
                 #Thesaurus(),
                 Wordnet(),
                 Word2Vec('../pre/wiki-news-300d-1M.vec'),
                 Glove('../pre/glove.6B.300d'),
                 #Anchor(anchorfile='../pre/anchor_text_en.ttl', vectorfile='../pre/wiki-anchor-text-en-ttl-300d.vec'),
                 #Wiki('../pre/temp_model_Wiki'),
                 Tagmee(),
                 SenseDisambiguation(),
                 Conceptnet(),
                 #Thesaurus(replace=True),
                 Wordnet(replace=True),
                 Word2Vec('../pre/wiki-news-300d-1M.vec', replace=True),
                 Glove('../pre/glove.6B.300d', replace=True),
                 #Anchor(anchorfile='../pre/anchor_text_en.ttl', vectorfile='../pre/wiki-anchor-text-en-ttl-300d.vec', replace=True),
                 #Wiki('../pre/temp_model_Wiki', replace=True),
                 Tagmee(replace=True),
                 SenseDisambiguation(replace=True),
                 Conceptnet(replace=True),
                 #Stem(KrovetzStemmer(jarfile='stemmers/kstem-3.4.jar')),
                 Stem(LovinsStemmer()),
                 Stem(PaiceHuskStemmer()),
                 Stem(PorterStemmer()),
                 Stem(Porter2Stemmer()),
                 Stem(SRemovalStemmer()),
                 Stem(Trunc4Stemmer()),
                 #Stem(Trunc5Stemmer()),
                 # since RF needs the index and the search output, which depend on the IR method and the topics database, we cannot add it here; instead, it is run individually
                 # RF assumes that the abstractqueryexpansion files already exist
                 ]

    return expanders

#local analysis
def get_rf_expanders(rankers, index, anserini, output):
    expanders = []
    for ranker in rankers:
        ranker_name = get_ranker_name(ranker)
        #expanders.append(RelevanceFeedback(ranker=ranker_name, prels='{}.abstractqueryexpansion.{}.txt'.format(output, ranker_name), anserini=anserini, index=index))
        #expanders.append(Docluster(ranker=ranker_name, prels='{}.abstractqueryexpansion.{}.txt'.format(output, ranker_name), anserini=anserini, index=index))
        #expanders.append(Termluster(ranker=ranker_name, prels='{}.abstractqueryexpansion.{}.txt'.format(output, ranker_name), anserini=anserini, index=index))
        #expanders.append(Conceptluster(ranker=ranker_name, prels='{}.abstractqueryexpansion.{}.txt'.format(output, ranker_name), anserini=anserini, index=index))
    return expanders

def get_expanders_names(rankers):
    expanders = get_nrf_expanders() + get_rf_expanders(rankers, None, None, None)
    return [e.get_model_name() for e in expanders]

def get_ranker_name(ranker):
    return ranker.replace('-', '').replace(' ', '.')
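
#For example (these follow directly from the two replace calls above):
#  get_ranker_name('-bm25')      -> 'bm25'
#  get_ranker_name('-bm25 -rm3') -> 'bm25.rm3'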

if __name__ == "__main__":
    print(get_expanders_names(['-bm25', '-bm25 -rm3', '-qld', '-qld -rm3']))
