Skip to content

Commit 555c137

Browse files
initial commit from previous repo
1 parent 21e8ca3 commit 555c137

16 files changed

+294
-1
lines changed

.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Auto detect text files and perform LF normalization
2+
* text=auto

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
target/
3+
dbt_packages/
4+
logs/

README.md

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,29 @@
1-
# dbt-ga4
1+
dbt guide to package creation: https://docs.getdbt.com/docs/guides/building-packages
dbt project structure notes: https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355
3+
4+
To connect to BigQuery using OAuth, see instructions here: https://docs.getdbt.com/reference/warehouse-profiles/bigquery-profile#local-oauth-gcloud-setup
5+
6+
# Configuration Instructions
7+
8+
Create the following variables scoped to the ga4 package in your dbt_project.yml
9+
- project (defaults to "bigquery-public-data")
10+
- dataset (defaults to "ga4_obfuscated_sample_ecommerce")
11+
12+
# TODO
13+
14+
- Macro to extract hostname from URL
15+
- Create staging tables for the following events:
16+
- scroll
17+
- first_visit
18+
- view_promotion
19+
- click
20+
- add_to_cart
21+
- purchase
22+
- Full event reference: https://developers.google.com/analytics/devguides/collection/ga4/reference/events
23+
24+
- Create stg_sessions model
25+
- Create stg_users model
26+
27+
- Recreate Fivetran ga3 models with ga4 data
28+
29+
- Convert basic unnesting operations into macros

analyses/.gitkeep

Whitespace-only changes.

dbt_project.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# dbt project configuration for the ga4 package.
name: 'ga4'
version: '1.0.0'
config-version: 2

profile: 'ga4'

model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

target-path: "target"  # directory which will store compiled SQL files
clean-targets:         # directories to be removed by `dbt clean`
  - "target"
  - "dbt_packages"

vars:
  # Defines the earliest GA4 _TABLE_SUFFIX to load into base events model.
  # 20201230 produces about 1GB of data scanned.
  start_date: "20201230"
  project: "bigquery-public-data"
  dataset: "ga4_obfuscated_sample_ecommerce"

# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

# In this example config, we tell dbt to build all models in the example/ directory
# as tables. These settings can be overridden in the individual model files
# using the `{{ config(...) }}` macro.
models:
  ga4:
    +materialized: view

macros/.gitkeep

Whitespace-only changes.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
-- Base model for raw GA4 event data.
-- BigQuery does not cache wildcard queries that scan across sharded tables, which
-- means it's best to materialize the raw event data as a partitioned table so that
-- future queries benefit from caching.
{{
    config(
        materialized = 'incremental',
        incremental_strategy = 'insert_overwrite',
        partition_by = {
            "field": "event_date_dt",
            "data_type": "date",
        }
    )
}}

with source as (

    select *
    from {{ source('ga4', 'events') }}
    where cast(_table_suffix as int64) >= {{ var('start_date') }}
    {% if is_incremental() %}
        -- Incrementally add new events. Filters on _TABLE_SUFFIX using the max
        -- event_date_dt value found.
        -- See https://docs.getdbt.com/reference/resource-configs/bigquery-configs#the-insert_overwrite-strategy
        and parse_date('%Y%m%d', _table_suffix) >= _dbt_max_partition
    {% endif %}

),

renamed as (

    select
        parse_date('%Y%m%d', event_date) as event_date_dt,
        -- Remove event_date to ensure usage of event_date_dt, which is partitioned.
        * except (event_date)
    from source

)

select * from renamed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
-- Variant of the base events model that overwrites a fixed, recent set of
-- partitions on each run.
-- If jobs are running daily, it may make sense to avoid the scanning necessary to
-- determine the max partition date. Instead, a static incremental range can be set
-- and this data will be overwritten/inserted at every incremental run.

{% set partitions_to_replace = [
    'current_date()',
    'date_sub(current_date(), interval 1 day)'
] %}

-- BigQuery does not cache wildcard queries that scan across sharded tables, which
-- means it's best to materialize the raw event data as a partitioned table so that
-- future queries benefit from caching.
{{
    config(
        materialized = 'incremental',
        incremental_strategy = 'insert_overwrite',
        partition_by = {
            "field": "event_date_dt",
            "data_type": "date",
        }
    )
}}

with source as (

    select *
    from {{ source('ga4', 'events') }}
    where cast(_table_suffix as int64) >= {{ var('start_date') }}
    {% if is_incremental() %}
        -- Recalculate yesterday + today only.
        and parse_date('%Y%m%d', _table_suffix) in ({{ partitions_to_replace | join(',') }})
    {% endif %}

),

renamed as (

    select
        parse_date('%Y%m%d', event_date) as event_date_dt,
        -- Remove event_date to ensure usage of event_date_dt, which is partitioned.
        * except (event_date)
    from source

)

select * from renamed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
-- Staging model for GA4 'page_view' events: unnests event_params and pivots the
-- relevant parameter keys into one column per key, one row per page_view event.
with page_view_with_params as (

    select
        event_date_dt,
        user_id,
        user_pseudo_id,
        event_timestamp,
        event_name,
        params,
        traffic_source
    from {{ ref('base_ga4__events') }},
        unnest(event_params) as params
    where event_name = 'page_view'
        and params.key in (
            'page_location',
            'ga_session_id',
            'ga_session_number',
            'entrances',
            'page_title',
            'page_referrer'
        )

),

-- Pivot the key/value parameter rows into one row per event.
pivoted as (

    select
        event_date_dt,
        user_pseudo_id,
        user_id,
        event_timestamp,
        event_name,
        traffic_source.name as traffic_source_name,
        traffic_source.source as traffic_source_source,
        traffic_source.medium as traffic_source_medium,
        max(if(params.key = "page_location", params.value.string_value, null)) as page_location,
        max(if(params.key = "ga_session_id", params.value.int_value, null)) as ga_session_id,
        max(if(params.key = "ga_session_number", params.value.int_value, null)) as ga_session_number,
        max(if(params.key = "entrances", params.value.int_value, 0)) as entrances,
        max(if(params.key = "page_title", params.value.string_value, null)) as page_title,
        max(if(params.key = "page_referrer", params.value.string_value, null)) as page_referrer
    from page_view_with_params
    group by
        event_date_dt,
        user_pseudo_id,
        user_id,
        event_timestamp,
        event_name,
        traffic_source_name,
        traffic_source_source,
        traffic_source_medium

)

select
    *,
    -- A session number of 1 marks the user's first session. NULL session numbers
    -- intentionally fall through to FALSE.
    case
        when ga_session_number = 1 then true
        else false
    end as is_new_user
from pivoted
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
-- Staging model for GA4 'purchase' events: unnests event_params and pivots the
-- relevant parameter keys into one column per key, one row per purchase event.
with purchase_with_params as (

    select
        event_date_dt,
        user_id,
        user_pseudo_id,
        event_timestamp,
        event_name,
        params,
        traffic_source
    from {{ ref('base_ga4__events') }},
        unnest(event_params) as params
    -- Pull only 'purchase' events.
    where event_name = 'purchase'
        and params.key in (
            'page_location',
            'ga_session_id',
            'ga_session_number',
            'page_referrer',
            'currency',
            'value',
            'payment_type',
            'coupon',
            'transaction_id'
        )

),

-- Pivot the key/value parameter rows into one row per event.
pivoted as (

    select
        event_date_dt,
        user_pseudo_id,
        user_id,
        event_timestamp,
        event_name,
        traffic_source.name as traffic_source_name,
        traffic_source.source as traffic_source_source,
        traffic_source.medium as traffic_source_medium,
        max(if(params.key = "page_location", params.value.string_value, null)) as page_location,
        max(if(params.key = "ga_session_id", params.value.int_value, null)) as ga_session_id,
        max(if(params.key = "ga_session_number", params.value.int_value, null)) as ga_session_number,
        max(if(params.key = "page_referrer", params.value.string_value, null)) as page_referrer,
        max(if(params.key = "coupon", params.value.string_value, null)) as coupon,
        max(if(params.key = "transaction_id", params.value.string_value, null)) as transaction_id,
        max(if(params.key = "currency", params.value.string_value, null)) as currency,
        max(if(params.key = "payment_type", params.value.string_value, null)) as payment_type,
        max(if(params.key = "value", params.value.float_value, null)) as value
        -- TODO: how to handle the items array?
    from purchase_with_params
    group by
        event_date_dt,
        user_pseudo_id,
        user_id,
        event_timestamp,
        event_name,
        traffic_source_name,
        traffic_source_source,
        traffic_source_medium

)

select
    *,
    -- A session number of 1 marks the user's first session. NULL session numbers
    -- intentionally fall through to FALSE.
    case
        when ga_session_number = 1 then true
        else false
    end as is_new_user
from pivoted

0 commit comments

Comments
 (0)