[chore] Refactored API to match torchFastText

meilame-tayebjee · meilame-tayebjee · commit 811cc65e624c · 2025-04-02T15:41:43.000Z
- Added torch / torchFastText dependencies
- Used dataloaders for inference
- Small fixes in input and response processing (the batch endpoint now returns a list of PredictionResponse)
- Loaded "mappings" from S3
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,8 @@ dependencies = [
     "pydantic>=2.11.1",
     "requests>=2.32.3",
     "s3fs>=2025.3.2",
+    "torch>=2.6.0",
+    "torchfasttext",
     "tqdm>=4.67.1",
     "unidecode>=1.3.8",
     "uvicorn>=0.34.0",
@@ -35,3 +37,6 @@ line-length = 130
 
 [tool.uv]
 default-groups = ["dev"]
+
+[tool.uv.sources]
+torchfasttext = { git = "https://github.com/InseeFrLab/torch-fastText.git", branch = "dataset-api" }
diff --git a/setup.sh b/setup.sh
@@ -1,9 +1,10 @@
 #!/bin/bash
 git config --global credential.helper store
 
-pip install -r requirements.txt
-pip install pre-commit
-pre-commit install
+pip install uv
+uv sync
+uv run pre-commit install
+uv run -m nltk.downloader stopwords
 
 AWS_ACCESS_KEY_ID=`vault kv get -field=ACCESS_KEY onyxia-kv/projet-ape/s3` && export AWS_ACCESS_KEY_ID
 AWS_SECRET_ACCESS_KEY=`vault kv get -field=SECRET_KEY onyxia-kv/projet-ape/s3` && export AWS_SECRET_ACCESS_KEY
diff --git a/src/api/constants/models.py b/src/api/constants/models.py
@@ -1,3 +1,3 @@
-VALID_TYPE_FORM = {"A", "B", "C", "D", "E", "G", "I", "L", "M", "N", "P", "R", "S", "X", "Y", "Z"}
-VALID_SURFACE = {"1", "2", "3", "4"}
-VALID_ACTIV_PERM = {"P", "S"}
+VALID_TYPE_FORM = {"A", "B", "C", "D", "E", "G", "I", "L", "M", "N", "P", "R", "S", "X", "Y", "Z", "NaN"}
+VALID_SURFACE = {"1", "2", "3", "4", "NaN"}
+VALID_ACTIV_PERM = {"P", "S", "NaN"}
diff --git a/src/api/main.py b/src/api/main.py
@@ -26,14 +26,7 @@ async def lifespan(app: FastAPI):
     logger.info("🚀 Starting API lifespan")
 
     model_uri = f"models:/{os.environ['MLFLOW_MODEL_NAME']}/{os.environ['MLFLOW_MODEL_VERSION']}"
-    app.state.model = mlflow.pyfunc.load_model(model_uri)
-    run_params = mlflow.get_run(app.state.model.metadata.run_id).data.params
-
-    app.state.training_names = [
-        run_params["text_feature"],
-        *(v for k, v in run_params.items() if k.startswith("textual_features")),
-        *(v for k, v in run_params.items() if k.startswith("categorical_features")),
-    ]
+    app.state.model = mlflow.pytorch.load_model(model_uri)
 
     libs_path = Path("api/data/libs.yaml")
     app.state.libs = yaml.safe_load(libs_path.read_text())
diff --git a/src/api/routes/predict_batch.py b/src/api/routes/predict_batch.py
@@ -1,19 +1,27 @@
-from typing import Annotated
+from typing import Annotated, List
 
+import numpy as np
+import torch
 from fastapi import APIRouter, Depends, Request
 from fastapi.security import HTTPBasicCredentials
+from torchFastText.datasets import FastTextModelDataset
 
 from api.models.forms import BatchForms
 from api.models.responses import PredictionResponse
 from utils.logging import log_prediction
 from utils.prediction import process_response
-from utils.preprocessing import preprocess_inputs
+from utils.preprocessing import categorical_features, mappings, preprocess_inputs, text_feature
 from utils.security import get_credentials
 
+router = APIRouter(prefix="/single", tags=["Predict an activity"])
+
+APE_NIV5_MAPPING = mappings["nace2025"]
+INV_APE_NIV5_MAPPING = {v: k for k, v in APE_NIV5_MAPPING.items()}
+
 router = APIRouter(prefix="/batch", tags=["Predict a batch of activity"])
 
 
-@router.post("/predict", response_model=PredictionResponse)
+@router.post("/predict", response_model=List[PredictionResponse])
 async def predict(
     credentials: Annotated[HTTPBasicCredentials, Depends(get_credentials)],
     request: Request,
@@ -33,13 +41,32 @@ async def predict(
     Returns:
         list: The list of predicted responses.
     """
-    query = preprocess_inputs(request.app.state.training_names, forms.forms)
+    query = preprocess_inputs(forms.forms)
+
+    text, categorical_variables = (
+        query[text_feature].values,
+        query[categorical_features].values,
+    )
+
+    dataset = FastTextModelDataset(
+        texts=text,
+        categorical_variables=categorical_variables,
+        tokenizer=request.app.state.model.model.tokenizer,
+    )
+
+    batch_size = len(text) if len(text) < 256 else 256
+    dataloader = dataset.create_dataloader(batch_size=batch_size, shuffle=False, num_workers=12)
+
+    batch = next(iter(dataloader))
+    scores = request.app.state.model(batch).detach()
+    probs = torch.nn.functional.softmax(scores, dim=1)
+    sorted_probs, sorted_probs_indices = probs.sort(descending=True, axis=1)
 
-    predictions = request.app.state.model.predict(query, params={"k": nb_echos_max})
+    predicted_class = sorted_probs_indices[:, :nb_echos_max].numpy()
+    predicted_probs = sorted_probs[:, :nb_echos_max].numpy()
 
-    response = [
-        process_response(predictions, i, nb_echos_max, prob_min, request.app.state.libs) for i in range(len(predictions[0]))
-    ]
+    predicted_class = np.vectorize(INV_APE_NIV5_MAPPING.get)(predicted_class)
+    predictions = (predicted_class, predicted_probs)
 
     responses = []
     for i in range(len(predictions[0])):
diff --git a/src/api/routes/predict_single.py b/src/api/routes/predict_single.py
@@ -1,17 +1,23 @@
 from typing import Annotated
 
+import numpy as np
+import torch
 from fastapi import APIRouter, Depends, Request
 from fastapi.security import HTTPBasicCredentials
+from torchFastText.datasets import FastTextModelDataset
 
 from api.models.forms import SingleForm
 from api.models.responses import PredictionResponse
 from utils.logging import log_prediction
 from utils.prediction import process_response
-from utils.preprocessing import preprocess_inputs
+from utils.preprocessing import categorical_features, mappings, preprocess_inputs, text_feature
 from utils.security import get_credentials
 
 router = APIRouter(prefix="/single", tags=["Predict an activity"])
 
+APE_NIV5_MAPPING = mappings["nace2025"]
+INV_APE_NIV5_MAPPING = {v: k for k, v in APE_NIV5_MAPPING.items()}
+
 
 @router.post("/predict", response_model=PredictionResponse)
 async def predict(
@@ -35,9 +41,30 @@ async def predict(
         dict: Response containing APE codes.
     """
 
-    query = preprocess_inputs(request.app.state.training_names, [form])
+    query = preprocess_inputs([form])
+
+    text, categorical_variables = (
+        query[text_feature].values,
+        query[categorical_features].values,
+    )
+
+    dataset = FastTextModelDataset(
+        texts=text,
+        categorical_variables=categorical_variables,
+        tokenizer=request.app.state.model.model.tokenizer,
+    )
+    dataloader = dataset.create_dataloader(batch_size=1, shuffle=False, num_workers=1)
+
+    batch = next(iter(dataloader))
+    scores = request.app.state.model(batch).detach()
+    probs = torch.nn.functional.softmax(scores, dim=1)
+    sorted_probs, sorted_probs_indices = probs.sort(descending=True, axis=1)
+
+    predicted_class = sorted_probs_indices[:, :nb_echos_max].numpy()
+    predicted_probs = sorted_probs[:, :nb_echos_max].numpy()
 
-    predictions = request.app.state.model.predict(query, params={"k": max(2, nb_echos_max)})
+    predicted_class = np.vectorize(INV_APE_NIV5_MAPPING.get)(predicted_class)
+    predictions = (predicted_class, predicted_probs)
 
     response = process_response(predictions, 0, nb_echos_max, prob_min, request.app.state.libs)
 
diff --git a/src/utils/prediction.py b/src/utils/prediction.py
@@ -18,11 +18,13 @@ def process_response(
     pred_labels = labels[liasse_nb]
     pred_probs = probs[liasse_nb]
 
-    valid_predictions = [
-        (label.replace("__label__", ""), prob) for label, prob in zip(pred_labels, pred_probs) if prob >= prob_min
-    ][:nb_echos_max]
+    valid_preds = []
+    mask = pred_probs >= prob_min
+    valid_predicted_class = pred_labels[mask]
+    valid_predicted_confidence = pred_probs[mask]
+    valid_preds.append(tuple(zip(valid_predicted_class, valid_predicted_confidence)))
 
-    if not valid_predictions:
+    if not valid_preds:
         raise HTTPException(
             status_code=400,
             detail="No prediction exceeds the minimum probability threshold.",
@@ -34,10 +36,10 @@ def process_response(
             probabilite=float(prob),
             libelle=libs[label],
         )
-        for i, (label, prob) in enumerate(valid_predictions)
+        for i, (label, prob) in enumerate(valid_preds[0])
     }
 
-    ic = response_data["1"].probabilite - float(pred_probs[1])
-    response_data["IC"] = ic
+    confidence_score = pred_probs[0] - pred_probs[1]
+    response_data["IC"] = confidence_score
 
     return PredictionResponse(response_data)
diff --git a/src/utils/preprocessing.py b/src/utils/preprocessing.py
diff --git a/uv.lock b/uv.lock