Skip to content

Commit 59d5a43

Browse files
authored
feat(analyse): affichage des ccs supportés et non supportés sur les contributions (#226)
1 parent 5f0ce57 commit 59d5a43

File tree

1 file changed

+213
-0
lines changed

1 file changed

+213
-0
lines changed
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# IDCCs traitées et non traitées pour les pages `contributions`\n",
8+
"\n",
9+
"Dans cette exploration, le but est de récupérer, pour chaque contribution générique, la liste des conventions collectives (IDCC) sélectionnées, en distinguant celles qui sont traitées de celles qui ne le sont pas."
10+
]
11+
},
12+
{
13+
"cell_type": "markdown",
14+
"metadata": {},
15+
"source": [
16+
"## 1. Chargement des librairies"
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": null,
22+
"metadata": {},
23+
"outputs": [],
24+
"source": [
25+
"import pandas as pd\n",
26+
"from src.elasticsearch_connector import ElasticsearchConnector\n",
27+
"\n",
28+
"pd.set_option('display.max_columns', None)\n",
29+
"pd.set_option('display.max_rows', 5000)"
30+
]
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"## 2. Récupération des logs sur Elasticsearch"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"es_connector = ElasticsearchConnector(env='monolog')\n",
46+
"\n",
47+
"QUERY_LOG_CONTRIB = {\n",
48+
" \"query\": {\n",
49+
" \"bool\": { \n",
50+
" \"must\": [\n",
51+
" {\n",
52+
" \"prefix\": {\n",
53+
" \"url\": \"https://code.travail.gouv.fr/contribution\" \n",
54+
" }\n",
55+
" },\n",
56+
" {\n",
57+
" \"range\": {\n",
58+
" \"logfile\": {\n",
59+
" \"gte\": \"2024-05-01\",\n",
60+
" \"lte\": \"2024-08-01\"\n",
61+
" }\n",
62+
" }\n",
63+
" }\n",
64+
" ]\n",
65+
" }\n",
66+
" }\n",
67+
"}"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": null,
73+
"metadata": {},
74+
"outputs": [],
75+
"source": [
76+
"logs = es_connector.execute_query(QUERY_LOG_CONTRIB, \"logs-new\")"
77+
]
78+
},
79+
{
80+
"cell_type": "markdown",
81+
"metadata": {},
82+
"source": [
83+
"## 3. Vue d'ensemble"
84+
]
85+
},
86+
{
87+
"cell_type": "code",
88+
"execution_count": null,
89+
"metadata": {},
90+
"outputs": [],
91+
"source": [
92+
"# Récupération des logs des urls de contribution génériques\n",
93+
"logs_generic = logs[~logs[\"url\"].str.contains(r\"contribution/\\d{1,4}-\", regex=True)]"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": null,
99+
"metadata": {},
100+
"outputs": [],
101+
"source": [
102+
"logs_generic_cc_select_traitée_et_non_traitée = logs_generic[\n",
103+
" (logs_generic[\"type\"] == \"cc_select_non_traitée\") | \n",
104+
" (logs_generic[\"type\"] == \"cc_select_traitée\")\n",
105+
"]"
106+
]
107+
},
108+
{
109+
"cell_type": "code",
110+
"execution_count": null,
111+
"metadata": {},
112+
"outputs": [],
113+
"source": [
114+
"logs_generic_cc_select_traitée_et_non_traitée[\"cleaned_url\"] = logs_generic_cc_select_traitée_et_non_traitée[\"url\"].str.split('#').str[0].str.split('?').str[0]"
115+
]
116+
},
117+
{
118+
"cell_type": "code",
119+
"execution_count": null,
120+
"metadata": {},
121+
"outputs": [],
122+
"source": [
123+
"grouped = logs_generic_cc_select_traitée_et_non_traitée.groupby(['cleaned_url', 'idCc', 'type']).size().reset_index(name='count')"
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": null,
129+
"metadata": {},
130+
"outputs": [],
131+
"source": [
132+
"# Pré-calcul des filtres\n",
133+
"traitée_filter = logs_generic_cc_select_traitée_et_non_traitée[\"type\"] == \"cc_select_traitée\"\n",
134+
"non_traitée_filter = logs_generic_cc_select_traitée_et_non_traitée[\"type\"] == \"cc_select_non_traitée\"\n",
135+
"\n",
136+
"# Calcul des totaux\n",
137+
"cc_select_traitée_total = logs_generic_cc_select_traitée_et_non_traitée[traitée_filter].shape[0]\n",
138+
"cc_select_non_traitée_total = logs_generic_cc_select_traitée_et_non_traitée[non_traitée_filter].shape[0]\n",
139+
"\n",
140+
"data = []\n",
141+
"\n",
142+
"for url, group in grouped.groupby('cleaned_url'):\n",
143+
" # Filtrer les logs pour l'url actuelle\n",
144+
" url_filter = logs_generic_cc_select_traitée_et_non_traitée[\"cleaned_url\"] == url\n",
145+
" nb_visits = logs_generic_cc_select_traitée_et_non_traitée[url_filter].shape[0]\n",
146+
" \n",
147+
" for _, row in group.iterrows():\n",
148+
" cc = row['idCc']\n",
149+
" type = row['type']\n",
150+
" count = row['count']\n",
151+
" \n",
152+
" data.append({\n",
153+
" 'url': url,\n",
154+
" 'cc': cc,\n",
155+
" 'type': type,\n",
156+
" 'nb_events': count,\n",
157+
" 'nb_visits': nb_visits,\n",
158+
" 'nb_events_sur_nb_visites': count / nb_visits * 100,\n",
159+
" 'cc_select_traitée_total': cc_select_traitée_total,\n",
160+
" 'cc_select_non_traitée_total': cc_select_non_traitée_total,\n",
161+
" })"
162+
]
163+
},
164+
{
165+
"cell_type": "code",
166+
"execution_count": null,
167+
"metadata": {},
168+
"outputs": [],
169+
"source": [
170+
"df = pd.DataFrame(data)"
171+
]
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": null,
176+
"metadata": {},
177+
"outputs": [],
178+
"source": [
179+
"df"
180+
]
181+
},
182+
{
183+
"cell_type": "code",
184+
"execution_count": null,
185+
"metadata": {},
186+
"outputs": [],
187+
"source": [
188+
"df.to_csv(\"contribution_generic_cc_select_traitée_et_non_traitée.csv\", index=False)"
189+
]
190+
}
191+
],
192+
"metadata": {
193+
"kernelspec": {
194+
"display_name": "Python 3 (ipykernel)",
195+
"language": "python",
196+
"name": "python3"
197+
},
198+
"language_info": {
199+
"codemirror_mode": {
200+
"name": "ipython",
201+
"version": 3
202+
},
203+
"file_extension": ".py",
204+
"mimetype": "text/x-python",
205+
"name": "python",
206+
"nbconvert_exporter": "python",
207+
"pygments_lexer": "ipython3",
208+
"version": "3.9.6"
209+
}
210+
},
211+
"nbformat": 4,
212+
"nbformat_minor": 4
213+
}

0 commit comments

Comments
 (0)