
%load_ext autoreload
%autoreload 2
from src.textlabelling.dataPrep import DataPrep
from src.textlabelling.labelling import Labelling
from src.textlabelling.nerstats import NerStats
from src.textlabelling.csvmodel import CSVModel
from src.textlabelling.tainner import TrainNer, save_model, evaluate
from src.textlabelling.dbconnect import Connect
from src.textlabelling.model import Model
from IPython import display  # needed for the display.HTML counter table below

import random

random.seed(0)
!pwd

### 1. Labelling Process

#### i. Data Labelling Container

TRAIN_DATA=[]
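
Each item appended to `TRAIN_DATA` follows the spaCy-style `(text, {'entities': [(start, end, label), ...]})` shape used further down in this walkthrough; for example (the text and offsets here are illustrative only):

```python
# Illustrative example only: the text and character offsets are made up,
# but the (text, {'entities': [...]}) shape matches what the labelling step appends.
example = ('refund was credited to my card', {'entities': [(0, 6, 'Payments')]})
```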

#### ii. Prepare the dataset for Labelling

Data_prep = DataPrep('xxxxxxxx.csv')
train, test = Data_prep.split_data()
Data_prep.filename
!head -n 6 xxxxxxxx.csv
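
`DataPrep.split_data` is repo-specific; as a rough sketch of the kind of shuffle-and-slice split it presumably performs (the `csv` reading and the 80/20 ratio below are assumptions, not the repo's code):

```python
# Hypothetical sketch of a shuffle-and-slice train/test split; the csv reading
# and the 80/20 ratio are assumptions made for illustration.
import csv
import random

def split_rows(filename, train_fraction=0.8):
    with open(filename, newline='') as f:
        rows = list(csv.reader(f))
    random.shuffle(rows)  # reproducible because of random.seed(0) above
    cut = int(len(rows) * train_fraction)
    return rows[:cut], rows[cut:]
```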

#### iii. Label data for Training

trainset = Data_prep.text_generator(train)  # use the DataPrep instance created above
number = 0
n_counter = 0

def main():
    text = next(trainset)
    text = text.lower()
    print('')
    print(text)
    print('')
    Label = Labelling(text)  # Labelling is imported from src.textlabelling.labelling
    for item in Label.token_to_tuple():
        print(item)

    result, counter = Label.text_entities_construct()

    TRAIN_DATA.append(result)
    return counter

counter = main()
number = number + 1
n_counter = n_counter + counter
# display.HTML("<p><b>Counter:</b><H1>" + str(number) + ' ' + str(n_counter) + "</H1></p>")
display.HTML("<table><tr><th><H1><center>Counter:</center></H1></th></tr>"
             "<tr><th>Global</th><th>Internal</th></tr>"
             "<tr><td><H1>" + str(number) + "</H1></td>"
             "<td><H1>" + str(n_counter) + "</H1></td></tr></table>")

#### iv. Save Labelled data

Stats = NerStats()
Stats.save_labelled_data(data=TRAIN_DATA, file_name='labelled_data/labelleddata.pkl')

#### v. Load Train data

TRAIN_DATA2 = Stats.load_labelled_data('labelled_data/labelleddata.pkl')

#### vi. Distribution of Labelled data

data = Stats.data_distribution(TRAIN_DATA2)
data.items()
Stats.distribution_visualizer(data)
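
`data_distribution` is repo-specific; assuming the labelled examples keep the `(text, {'entities': [...]})` shape shown above, the same label counts can be obtained directly with `collections.Counter`:

```python
# Hedged sketch: count entity labels across the labelled examples.
# Assumes each item is (text, {'entities': [(start, end, label), ...]}).
from collections import Counter

label_counts = Counter(
    label
    for _, annotations in TRAIN_DATA2
    for _, _, label in annotations['entities']
)
label_counts
```
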
#### vii. Fix Training data
for idx, value in enumerate(TRAIN_DATA2):
    if idx == 1236:
        # if 'payments' in value[1]['entities'][0]:
        #     print(idx, value[1]['entities'])
        print(idx, value)

TRAIN_DATA2[114] = ('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', {'entities': [(0, 29, 'Payments'), (116, 128, 'Payments')]})
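
Spot-fixing individual examples by index is easy to get wrong; a small helper such as the one below (not part of the repo) can flag spans whose offsets fall outside the text or overlap each other before training:

```python
# Hypothetical validation helper, not part of the repo: flags out-of-range or
# overlapping entity spans in (text, {'entities': [...]}) examples.
def find_bad_examples(examples):
    bad = []
    for idx, (text, annotations) in enumerate(examples):
        spans = sorted(annotations['entities'])
        for i, (start, end, label) in enumerate(spans):
            if not (0 <= start < end <= len(text)):
                bad.append((idx, 'out of range', (start, end, label)))
            if i > 0 and start < spans[i - 1][1]:
                bad.append((idx, 'overlap', (start, end, label)))
    return bad

find_bad_examples(TRAIN_DATA2)
```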

### 2. Training Process

#### i. Train Model

model = 'model_v1'
n_accuracy = {}
Trainer = TrainNer(TRAIN_DATA2, n_iter=150)
nlp, losses = Trainer.train_model()
# n_accuracy[model] = 100 - losses['ner']
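
`TrainNer` is repo-specific; for reference, a minimal spaCy v2-style NER training loop over data in this format looks roughly like the sketch below (the blank English model, label registration, and minibatching are assumptions about what `train_model` wraps, not the repo's code):

```python
# Hedged sketch of a spaCy v2-style NER training loop; an assumption about
# what TrainNer.train_model wraps, not the repo's implementation.
import random
import spacy
from spacy.util import minibatch

def train_ner(train_data, n_iter=150):
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    for _, annotations in train_data:
        for _, _, label in annotations['entities']:
            ner.add_label(label)
    optimizer = nlp.begin_training()
    losses = {}
    for _ in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for batch in minibatch(train_data, size=8):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.3, sgd=optimizer, losses=losses)
    return nlp, losses
```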

#### ii. Save Model

type(nlp)
save_model(nlp, model)  # save_model is imported from src.textlabelling.tainner above
n_accuracy

### 3. Evaluate

TEST_DATA = Stats.load_labelled_data('labelled_data/labelledtest.pkl')
evaluate(nlp, TEST_DATA)
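
`evaluate` comes from `src.textlabelling.tainner`; NER evaluation over data in this format is commonly done with spaCy v2's `Scorer` and `GoldParse`, roughly as sketched below (an assumption about what `evaluate` computes, not the repo's code):

```python
# Hedged sketch of spaCy v2-style NER evaluation over (text, annotations) pairs.
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate_ner(nlp, examples):
    scorer = Scorer()
    for text, annotations in examples:
        gold = GoldParse(nlp.make_doc(text), entities=annotations['entities'])
        scorer.score(nlp(text), gold)
    # entity-level precision, recall and F1
    return {k: scorer.scores[k] for k in ('ents_p', 'ents_r', 'ents_f')}
```
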
### 4. Test Model for Prediction

template = """
 select created_date, reference_ticket, nps_verbatim,nps_score from table'
"""
Model = Model('model_v1', template, 'config/config.ini','Redshift_prod')

#### i. Create the table if it does not exist

nps_model.create_table('public.nps_table')

#### ii. Update template

nps_model.template = template

#### iii. Insert the dataset into a Redshift table

nps_model.insert_to_redshift()
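
`Model` wires prediction and the Redshift write together; for a quick standalone check of the saved NER model, something like the snippet below works (it assumes `save_model` wrote the spaCy pipeline to a directory named `model_v1`; the verbatim text is illustrative):

```python
# Hedged sketch: load the saved pipeline and inspect predicted entities on one verbatim.
# Assumes the pipeline was written to a directory named 'model_v1'.
import spacy

nlp_loaded = spacy.load('model_v1')
doc = nlp_loaded('example nps verbatim about a delayed refund payment')
print([(ent.text, ent.label_) for ent in doc.ents])
```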