diff --git a/source/custom_scripts/dump_query_results.py b/source/custom_scripts/dump_query_results.py
index abc76f3..90e9981 100644
--- a/source/custom_scripts/dump_query_results.py
+++ b/source/custom_scripts/dump_query_results.py
@@ -30,22 +30,10 @@ def dump_query_results():
     query = "use " + metrics
     cursor.execute(query)
 
-    # CHANGE QUERY HERE
-    # query = "select username, display_name, email, orcid, kb_internal_user, institution, country, signup_date, last_signin_date from user_info order by signup_date"
-    # Query for Adam Narratives dump of information:
-    # select wc.* from metrics.user_info ui inner join metrics_reporting.workspaces_current wc on ui.username = wc.username
-    # where ui.kb_internal_user = 0 and wc.narrative_version > 0 and is_deleted = 0 and is_temporary = 0;
-    #query = ("select * from metrics_reporting.narrative_app_flows")
-    query = ("select * from metrics_reporting.user_super_summary")
-    # CHANGE COLUMN HEADERS HERE TO MATCH QUERY HEADERS
-    # print("username\temail\tlast_signin_date\tmax_last_seen\tHasBeenSeen")
-    # print("ws_id\tusername\tmod_date\tinitial_save_date\trecord_date\ttop_lvl_object_count\ttotal_object_count\tvisible_app_cells_count\tnarrative_version\thidden_object_count\tdeleted_object_count\ttotal_size\ttop_lvl_size\tis_public\tis_temporary\tnumber_of_shares")
-    # Headers for Adam's narratives query (Note if more columns added, may need to update this
-    # print(
-    #     "ws_id\tusername\tmod_date\tinitial_save_date\trecord_date\ttop_lvl_object_count\ttotal_object_count\tvisible_app_cells_count\tcode_cells_count\t"
-    #     "narrative_version\thidden_object_count\tdeleted_object_count\ttotal_size\ttop_lvl_size\tis_public\tis_temporary\tis_deleted\tnumber_of_shares\t"
-    #     "num_nar_obj_ids\tstatic_narratives_count"
-    # )
+    # CHANGE QUERIES AND HEADERS HERE
+
+    # USER SUPER SUMMARY
+    query = ("select * from metrics_reporting.user_super_summary")
     # HEADERS FOR user_super_summary
     print(
         "username\tdisplay_name\temail\tkb_internal_user\tuser_id\tglobus_login\tgoogle_login\torcid\tsession_info_country\tcountry\tstate\t"
@@ -58,9 +46,43 @@ def dump_query_results():
         "total_apps_run_last90\ttotal_apps_run_last30\ttotal_app_errors_all_time\tfirst_app_run\tlast_app_run\ttotal_run_time_hours\t"
         "total_queue_time_hours\ttotal_CPU_hours\tsession_count_all_time\tsession_count_last_year\tsession_count_last_90\tsession_count_last_30"
     )
-    #Header for Adam's narrative_app_flow
+
+    # APP FLOWS - for Adam's narrative_app_flow
+    #query = ("select * from metrics_reporting.narrative_app_flows")
     #print("ws_id\tusername\tapp_name\tfunc_name\tstart_date\tfinish_date")
+
+    # app popularity growth
+    #query = ("select uau.app_name, DATE_FORMAT(`finish_date`,'%Y-%m') as run_month, count(*) as run_count, sum(run_time)/3600 as total_run_hours\
+    #         from metrics.user_app_usage uau inner join metrics.user_info ui on uau.username = ui.username\
+    #         where ui.kb_internal_user = 0\
+    #         group by uau.app_name, run_month\
+    #         order by run_month, app_name")
+    #print("app_name\trun_month\trun_count\ttotal_run_hours")
+
+    # App category run totals
+    #query = ("select uau.app_name,\
+    #         IFNULL(app_category, \"No Category Association\") as app_cat,\
+    #         DATE_FORMAT(`finish_date`,'%Y-%m') as run_month, count(*) as run_count,\
+    #         sum(run_time)/3600 as total_run_hours\
+    #         from metrics.user_app_usage uau inner join\
+    #         metrics.user_info ui on uau.username = ui.username\
+    #         left outer join\
+    #         metrics.app_name_category_map anm on uau.app_name = anm.app_name\
+    #         where ui.kb_internal_user = 0\
+    #         group by uau.app_name, app_cat, run_month\
+    #         order by run_month, app_name;")
+    #print("app_name\tapp_cat\trun_month\trun_count\ttotal_run_hours")
+    # USER SESSION STATS:
+    #query = ("select si.username, count(*) as session_count, sum(estimated_hrs_active) total_hours_active,\
+    #         avg(estimated_hrs_active) avg_hours_active, std(estimated_hrs_active) std_hours_active,\
+    #         min(first_seen), max(last_seen)\
+    #         from metrics.user_info ui inner join metrics.session_info si on ui.username = si.username\
+    #         where estimated_hrs_active < 24\
+    #         group by username\
+    #         order by avg_hours_active desc, session_count, total_hours_active")
+    #print("username\tsession_count\ttotal_hours_active\tavg_hours_active\tstd_hours_active\tfirst_seen\tlast_seen")
+
+
     cursor.execute(query)
     row_values = list()
 
diff --git a/source/monthly_cron_jobs/backfill_static_views_for_doi_metrics.py b/source/monthly_cron_jobs/backfill_static_views_for_doi_metrics.py
new file mode 100644
index 0000000..4b410ea
--- /dev/null
+++ b/source/monthly_cron_jobs/backfill_static_views_for_doi_metrics.py
@@ -0,0 +1,172 @@
+import csv
+import os
+import requests
+import mysql.connector as mysql
+
+metrics_mysql_password = os.environ["METRICS_MYSQL_PWD"]
+
+sql_host = os.environ["SQL_HOST"]
+query_on = os.environ["QUERY_ON"]
+
+requests.packages.urllib3.disable_warnings()
+
+kb_google_analytics_url = os.environ["KB_GOOGLE_ANALYTICS_URL"]
+
+
+def _parse_monthly_page_views(csv_text):
+    """
+    Parses the CSV export of the "Monthly" sheet.
+
+    Rows up to and including the "Landing Page" header row are ignored.
+    Returns a dict keyed ws_id -> year -> month -> monthly_page_view.
+    """
+    monthly_stats = dict()
+    found_header_line = False
+    i = 1
+    # splitlines() also copes with "\r\n" line endings
+    for line in csv_text.splitlines():
+        if not line.strip():
+            # blank lines (e.g. after a trailing newline) hold no data
+            continue
+        # csv.reader removes the field quoting and keeps quoted commas intact
+        line_elements = next(csv.reader([line]))
+        first_element = line_elements[0].strip()
+        if not found_header_line:
+            found_header_line = first_element == "Landing Page"
+            continue
+        # the workspace id is the third "/"-separated piece of the landing page
+        ws_id = int(first_element.split("/")[2])
+        print("ws_id :" + str(ws_id))
+        year = int(line_elements[1].strip())
+        print("year :" + str(year))
+        month = int(line_elements[2].strip())
+        print("month :" + str(month))
+        page_views = int(line_elements[3].strip())
+        print("page_views :" + str(page_views))
+
+        year_stats = monthly_stats.setdefault(ws_id, dict()).setdefault(year, dict())
+        year_stats[month] = year_stats.get(month, 0) + page_views
+        print(str(i) + " :: " + line)
+        i += 1
+    return monthly_stats
+
+
+def get_kbase_google_analytics():
+    """
+    Backfills static_narrative_views in metrics.doi_metrics with the running
+    total of page views from the KBase Google analytics sheet.
+
+    Returns 1 when the backfill completed, 0 if the sheet request failed.
+    """
+    params = (("tqx", "out:csv"), ("sheet", "Monthly"))
+    response = requests.get(kb_google_analytics_url, params=params)
+    if response.status_code != 200:
+        print(
+            "ERROR - KBase Google analytics GOOGLE SHEET RESPONSE STATUS CODE : "
+            + str(response.status_code)
+        )
+        print(
+            "KBase Google analytics."
+        )
+        return 0
+
+    # key ws_id -> year -> month -> monthly_page_view
+    static_narrative_view_monthly_stats = _parse_monthly_page_views(response.text)
+
+    print(str(static_narrative_view_monthly_stats))
+    print("Length static_narrative_view_monthly_stats : " + str(len(static_narrative_view_monthly_stats)))
+
+    # running totals over only the months present in the sheet (reported, not stored)
+    static_narrative_view_summary_stats = dict()
+    for ws_id in static_narrative_view_monthly_stats:
+        running_total_page_views = 0
+        static_narrative_view_summary_stats[ws_id] = dict()
+        for year in sorted(static_narrative_view_monthly_stats[ws_id]):
+            static_narrative_view_summary_stats[ws_id][year] = dict()
+            for month in sorted(static_narrative_view_monthly_stats[ws_id][year]):
+                running_total_page_views += static_narrative_view_monthly_stats[ws_id][year][month]
+                static_narrative_view_summary_stats[ws_id][year][month] = running_total_page_views
+
+    print(str(static_narrative_view_summary_stats))
+    print("Length static_narrative_view_summary_stats : " + str(len(static_narrative_view_summary_stats)))
+
+    # running totals for every month of the backfill window, with or without views
+    years_to_do = [2020, 2021, 2022, 2023]
+    months_to_do = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+    static_narrative_view_complete_stats = dict()
+    for ws_id in static_narrative_view_monthly_stats:
+        running_total_page_views = 0
+        static_narrative_view_complete_stats[ws_id] = dict()
+        for year in years_to_do:
+            static_narrative_view_complete_stats[ws_id][year] = dict()
+            year_stats = static_narrative_view_monthly_stats[ws_id].get(year, dict())
+            for month in months_to_do:
+                running_total_page_views += year_stats.get(month, 0)
+                static_narrative_view_complete_stats[ws_id][year][month] = running_total_page_views
+
+    print(str(static_narrative_view_complete_stats))
+    print("Length static_narrative_view_complete_stats : " + str(len(static_narrative_view_complete_stats)))
+
+    # connect to mysql
+    db_connection = mysql.connect(
+        host=sql_host, user="metrics", passwd=metrics_mysql_password, database="metrics"
+    )
+    try:
+        cursor = db_connection.cursor()
+        query = "use " + query_on
+        cursor.execute(query)
+
+        # ws_id -> year -> set of months that already have a doi_metrics record
+        existing_doi_metrics_dict = dict()
+        get_existing_doi_metrics_statement = (
+            "select ws_id, DATE_FORMAT(`record_date`,'%Y') as year, DATE_FORMAT(`record_date`,'%m') as month "
+            "from doi_metrics")
+        cursor.execute(get_existing_doi_metrics_statement)
+        for (ws_id, year, month) in cursor:
+            existing_doi_metrics_dict.setdefault(ws_id, dict()).setdefault(int(year), set()).add(int(month))
+
+        update_prep_cursor = db_connection.cursor(prepared=True)
+
+        udate_narratives_views_statement = (
+            "update metrics.doi_metrics set static_narrative_views = %s "
+            "where ws_id = %s and DATE_FORMAT(`record_date`,'%Y-%m') = %s;"
+        )
+
+        updates_performed = 0
+
+        for ws_id in static_narrative_view_complete_stats:
+            if ws_id not in existing_doi_metrics_dict:
+                continue
+            for year in static_narrative_view_complete_stats[ws_id]:
+                for month in static_narrative_view_complete_stats[ws_id][year]:
+                    # the total through a month goes on the record dated in the following month
+                    temp_month = month + 1
+                    temp_year = year
+                    if temp_month == 13:
+                        temp_month = 1
+                        temp_year = year + 1
+                    if temp_month not in existing_doi_metrics_dict[ws_id].get(temp_year, set()):
+                        continue
+                    date_used = str(temp_year) + "-" + str(temp_month).zfill(2)
+                    # Do update statement
+                    update_values = (
+                        static_narrative_view_complete_stats[ws_id][year][month],
+                        ws_id,
+                        date_used,
+                    )
+                    print("udate_narratives_views_statement : " + udate_narratives_views_statement)
+                    print("input : " + str(update_values))
+                    update_prep_cursor.execute(udate_narratives_views_statement, update_values)
+                    updates_performed += 1
+
+        db_connection.commit()
+    finally:
+        # always release the connection, also when a statement fails
+        db_connection.close()
+
+    print("Total updates performed: " + str(updates_performed))
+
+    return 1
+
+
+get_kbase_google_analytics()
diff --git a/sql_create_statements/sql_reporting_views_and_tables.sql b/sql_create_statements/sql_reporting_views_and_tables.sql
index 3212809..cf72a3a 100644
--- a/sql_create_statements/sql_reporting_views_and_tables.sql
+++ b/sql_create_statements/sql_reporting_views_and_tables.sql
@@ -1592,3 +1592,17 @@ dmc.derived_object_count, dmc.copied_only_object_count, dmc.fully_derived_object
 from metrics.doi_ws_map dwm inner join
 metrics_reporting.doi_metrics_current dmc on dwm.ws_id =dmc.ws_id
 order by dwm.doi_url, is_parent_ws desc);
+
+
+create or replace view metrics_reporting.doi_metrics_current_report
+as (
+select dwm.doi_url AS doi_url, dwm.title AS title, dwm.is_parent_ws AS is_parent_ws,
+dmc.ws_id AS ws_id, dmc.record_date AS record_date, dmc.unique_users_count AS unique_users_count, dmc.unique_ws_ids_count AS unique_ws_ids_count,
+dmc.ttl_dls_cnt AS ttl_dls_cnt, dmc.ttl_uniq_dl_users_cnt AS ttl_uniq_dl_users_cnt, dmc.ttl_dl_user_doi_obj_cnt AS ttl_dl_user_doi_obj_cnt,
+dmc.ttl_dl_users_dled_obj_cnt AS ttl_dl_users_dled_obj_cnt, dmc.derived_object_count AS derived_object_count,
+dmc.copied_only_object_count AS copied_only_object_count, dmc.fully_derived_object_pair_counts AS fully_derived_object_pair_counts,
+wc.static_narratives_views
+from metrics.doi_ws_map dwm inner join
+metrics_reporting.doi_metrics_current dmc on dwm.ws_id = dmc.ws_id
+inner join metrics_reporting.workspaces_current wc on dmc.ws_id = wc.ws_id
+order by dwm.doi_url,dwm.is_parent_ws desc);