import re
import os
import json
import pickle
from functools import partial
from collections import Counter

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')
tqdm.pandas()
def tokenize(text, stop_words):
    """Lowercase the text, strip punctuation, tokenize, and drop stop words."""
    text = str(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = wordpunct_tokenize(text)
    return [token for token in tokens if token not in stop_words]
32+
def remove_rare_words(tokens, common_tokens, max_len):
    """Replace out-of-vocabulary tokens with '<UNK>' and keep at most the last max_len tokens."""
    return [token if token in common_tokens else '<UNK>' for token in tokens][-max_len:]


class CustomDataset:
    """Cleans the raw text, limits the vocabulary, and label-encodes the targets."""

    def __init__(self, df, max_vocab, max_len, name):
        df = df.copy()  # work on a copy so the caller's DataFrame is not modified
        stop_words = set(stopwords.words('english'))

        # Tokenize the raw text
        df['tokens'] = df.input_text.progress_apply(
            partial(tokenize, stop_words=stop_words),
        )

        # Keep only the max_vocab most frequent tokens; everything else becomes '<UNK>'
        all_tokens = [token for tokens in df.tokens.tolist() for token in tokens]
        common_tokens = set(token for token, _ in Counter(all_tokens).most_common(max_vocab))
        df['tokens'] = df.tokens.progress_apply(
            partial(remove_rare_words, common_tokens=common_tokens, max_len=max_len),
        )

        # Drop rows whose tokens are all '<UNK>'
        df = df[df.tokens.progress_apply(lambda tokens: any(token != '<UNK>' for token in tokens))].copy()

        df['clean_text'] = df.tokens.apply(lambda tokens: ' '.join(tokens))

        self.text = df.clean_text.tolist()
        self.labels = df.labels.tolist()

        # Encode the string labels and persist the mapping for later inference
        label_encoder = LabelEncoder()
        self.encoded_labels = label_encoder.fit_transform(df['labels'])
        encoder_name = f"LabelMapping-{name}.joblib"
        encoder_filename = os.path.join("api", "encoders", encoder_name)
        os.makedirs(os.path.dirname(encoder_filename), exist_ok=True)
        joblib.dump(label_encoder, encoder_filename)
63+
def create_and_train_nb_model(df, name, epochs=10, batch_size=16, learning_rate=0.001, valid_ratio=0.05, test_ratio=0.05):
    # Note: epochs, batch_size and learning_rate are not used by the Naive Bayes pipeline.
    max_vocab = 20000
    max_len = 200

    dataset = CustomDataset(df, max_vocab, max_len, name)

    # Split off a held-out portion, then divide it into validation and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(
        dataset.text, dataset.encoded_labels, test_size=valid_ratio + test_ratio, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp, y_temp, test_size=test_ratio / (valid_ratio + test_ratio), random_state=42)

    # Creating a pipeline with TF-IDF vectorizer and Multinomial Naive Bayes
    pipeline = make_pipeline(TfidfVectorizer(max_features=max_vocab), MultinomialNB())

    # Fitting the model to the training data
    pipeline.fit(X_train, y_train)

    # Evaluating on the validation set
    valid_preds = pipeline.predict(X_valid)
    valid_acc = accuracy_score(y_valid, valid_preds)
    print(f'Validation Accuracy: {valid_acc:.2f}')
    print(f'Validation Report:\n{classification_report(y_valid, valid_preds)}')

    # Evaluating on the test set
    test_preds = pipeline.predict(X_test)
    test_acc = accuracy_score(y_test, test_preds)
    print(f'Test Accuracy: {test_acc:.2f}')
    print(f'Test Report:\n{classification_report(y_test, test_preds)}')

    # Saving the pipeline to a file
    model_path = os.path.join('api', 'models', f"{name}_pipeline.pkl")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, "wb") as model_file:
        pickle.dump(pipeline, model_file)

    # Reset the training-progress status file
    training_progress = {
        'training_progress': 0,
        'training_in_progress': False
    }
    with open('training_progress.json', 'w') as file:
        json.dump(training_progress, file)
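

# --- Hypothetical usage sketch (not part of the original commit) ---
# Shows how the training function and the saved artifacts might be used.
# The 'data.csv' file name, the 'spam' model name, and the sample sentence
# are illustrative assumptions; the DataFrame is expected to contain
# 'input_text' and 'labels' columns.
if __name__ == '__main__':
    df = pd.read_csv('data.csv')  # assumed columns: input_text, labels
    create_and_train_nb_model(df, name='spam')

    # Load the saved pipeline and label encoder back for inference
    with open(os.path.join('api', 'models', 'spam_pipeline.pkl'), 'rb') as model_file:
        pipeline = pickle.load(model_file)
    label_encoder = joblib.load(os.path.join('api', 'encoders', 'LabelMapping-spam.joblib'))

    # NOTE: training used the cleaned text produced by CustomDataset; apply the
    # same cleaning here if exact parity with training preprocessing is needed.
    preds = pipeline.predict(['free prize, click now'])
    print(label_encoder.inverse_transform(preds))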