Skip to content

Commit

Permalink
feat(analyse): affichage des ccs supportés et non supportés sur les c…
Browse files Browse the repository at this point in the history
…ontributions (#226)
  • Loading branch information
maxgfr authored Aug 14, 2024
1 parent 5f0ce57 commit 59d5a43
Showing 1 changed file with 213 additions and 0 deletions.
213 changes: 213 additions & 0 deletions explorations/selection_cc_contributions.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IDCCs traitées et non traitées pour les pages `contributions`\n",
"\n",
"Dans cette exploration, le but est de récupérer pour chaque contribution générique, la liste des IDCCs sélectionnés"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Chargement des librairies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from src.elasticsearch_connector import ElasticsearchConnector\n",
"\n",
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_rows', 5000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Récupération des queries sur elasticsearch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"es_connector = ElasticsearchConnector(env='monolog')\n",
"\n",
"QUERY_LOG_CONTRIB = {\n",
" \"query\": {\n",
" \"bool\": { \n",
" \"must\": [\n",
" {\n",
" \"prefix\": {\n",
" \"url\": \"https://code.travail.gouv.fr/contribution\" \n",
" }\n",
" },\n",
" {\n",
" \"range\": {\n",
" \"logfile\": {\n",
" \"gte\": \"2024-05-01\",\n",
" \"lte\": \"2024-08-01\"\n",
" }\n",
" }\n",
" }\n",
" ]\n",
" }\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logs = es_connector.execute_query(QUERY_LOG_CONTRIB, \"logs-new\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Vue d'ensemble"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Récupération des logs des urls de contribution génériques\n",
"logs_generic = logs[~logs[\"url\"].str.contains(r\"contribution/\\d{1,4}-\", regex=True)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logs_generic_cc_select_traitée_et_non_traitée = logs_generic[\n",
" (logs_generic[\"type\"] == \"cc_select_non_traitée\") | \n",
" (logs_generic[\"type\"] == \"cc_select_traitée\")\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logs_generic_cc_select_traitée_et_non_traitée[\"cleaned_url\"] = logs_generic_cc_select_traitée_et_non_traitée[\"url\"].str.split('#').str[0].str.split('?').str[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"grouped = logs_generic_cc_select_traitée_et_non_traitée.groupby(['cleaned_url', 'idCc', 'type']).size().reset_index(name='count')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pré-calcul des filtres\n",
"traitée_filter = logs_generic_cc_select_traitée_et_non_traitée[\"type\"] == \"cc_select_traitée\"\n",
"non_traitée_filter = logs_generic_cc_select_traitée_et_non_traitée[\"type\"] == \"cc_select_non_traitée\"\n",
"\n",
"# Calcul des totaux\n",
"cc_select_traitée_total = logs_generic_cc_select_traitée_et_non_traitée[traitée_filter].shape[0]\n",
"cc_select_non_traitée_total = logs_generic_cc_select_traitée_et_non_traitée[non_traitée_filter].shape[0]\n",
"\n",
"data = []\n",
"\n",
"for url, group in grouped.groupby('cleaned_url'):\n",
" # Filtrer les logs pour l'url actuelle\n",
" url_filter = logs_generic_cc_select_traitée_et_non_traitée[\"cleaned_url\"] == url\n",
" nb_visits = logs_generic_cc_select_traitée_et_non_traitée[url_filter].shape[0]\n",
" \n",
" for _, row in group.iterrows():\n",
" cc = row['idCc']\n",
" type = row['type']\n",
" count = row['count']\n",
" \n",
" data.append({\n",
" 'url': url,\n",
" 'cc': cc,\n",
" 'type': type,\n",
" 'nb_events': count,\n",
" 'nb_visits': nb_visits,\n",
" 'nb_events_sur_nb_visites': count / nb_visits * 100,\n",
" 'cc_select_traitée_total': cc_select_traitée_total,\n",
" 'cc_select_non_traitée_total': cc_select_non_traitée_total,\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"contribution_generic_cc_select_traitée_et_non_traitée.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

0 comments on commit 59d5a43

Please sign in to comment.