From 08ea9e065fa65ecf84129f218871687a0fb6a59a Mon Sep 17 00:00:00 2001 From: Carl Cervone <42869436+ccerv1@users.noreply.github.com> Date: Mon, 30 Dec 2024 08:13:31 -0700 Subject: [PATCH] Update user model and network graph tutorial (#2686) * feat(dbt): add github users to users model * refactor(docs): network graph queries --- apps/docs/docs/tutorials/network-graph.md | 373 +++++++++--------- .../models/intermediate/users/int_users.sql | 21 + 2 files changed, 207 insertions(+), 187 deletions(-) diff --git a/apps/docs/docs/tutorials/network-graph.md b/apps/docs/docs/tutorials/network-graph.md index 9bef16009..a46dfcf93 100644 --- a/apps/docs/docs/tutorials/network-graph.md +++ b/apps/docs/docs/tutorials/network-graph.md @@ -26,54 +26,56 @@ We'll start by identifying projects with significant onchain activity on chains - Activity on specific chains (`OPTIMISM`, `BASE`, `MODE`) ```sql -WITH relevant_projects AS ( - SELECT +with relevant_projects as ( + select project_id, project_name, - SUM(gas_fees_sum_6_months) as gas_fees, - SUM(transaction_count_6_months) as txns, - SUM(address_count_90_days) as users - FROM `oso_production.onchain_metrics_by_project_v1` - WHERE event_source IN ('OPTIMISM', 'BASE', 'MODE') - GROUP BY 1, 2 - HAVING txns > 1000 AND users > 420 + sum(gas_fees_sum_6_months) as gas_fees, + sum(transaction_count_6_months) as txns, + sum(address_count_90_days) as users + from `oso_production.onchain_metrics_by_project_v1` + where event_source in ('OPTIMISM', 'BASE', 'MODE') + group by 1, 2 + having txns > 1000 and users > 420 ) ``` ### Fetch relevant repositories -Next, we identify repositories related to these projects by joining with the `oso_production.int_repo_metrics_by_project` table. We filter for repositories using specific programming languages (`TypeScript`, `Solidity`, `Rust`). +Next, we identify repositories related to these projects by joining with the `oso_production.repositories_v0` table. We filter for repositories using specific programming languages (`TypeScript`, `Solidity`, `Rust`). ```sql -relevant_repos AS ( - SELECT rm.artifact_id, p.project_name, p.project_id - FROM `oso_production.int_repo_metrics_by_project` rm - JOIN relevant_projects p ON rm.project_id = p.project_id - WHERE rm.language IN ('TypeScript', 'Solidity', 'Rust') +relevant_repos as ( + select rm.artifact_id, p.project_name, p.project_id + from `oso_production.repositories_v0` rm + join relevant_projects p on rm.project_id = p.project_id + where rm.language in ('TypeScript', 'Solidity', 'Rust') ) ``` ### Identify core developers -We then identify core developers who have made significant contributions to these repositories. This involves querying the `oso_production.int_events__github` table for developers who: +We then identify core developers who have made significant contributions to these repositories. This involves querying the `oso_production.timeseries_events_by_artifact_v0` table for developers who: - Have committed code (`event_type = 'COMMIT_CODE'`) - Are not bots - Have contributed over multiple months and with a minimum amount ```sql -core_devs AS ( - SELECT DISTINCT - from_artifact_id as developer_id, - from_artifact_name as developer_name, - to_artifact_id as repo_id - FROM `oso_production.int_events__github` - WHERE to_artifact_id IN (SELECT artifact_id FROM relevant_repos) - AND event_type = 'COMMIT_CODE' - AND from_artifact_name NOT LIKE '%[bot]%' - GROUP BY 1, 2, 3 - HAVING COUNT(DISTINCT date_trunc(`time`, MONTH)) >= 3 - AND SUM(amount) >= 20 +core_devs as ( + select + e.from_artifact_id as developer_id, + u.display_name as developer_name, + e.to_artifact_id as repo_id + from `oso_production.timeseries_events_by_artifact_v0` e + join `oso_production.users_v1` u + on e.from_artifact_id = u.user_id + where e.to_artifact_id in (select artifact_id from relevant_repos) + and e.event_type = 'COMMIT_CODE' + and u.display_name not like '%[bot]%' + group by 1,2,3 + having count(distinct date_trunc(time, month)) >= 3 + and sum(amount) >= 20 ) ``` @@ -82,10 +84,10 @@ core_devs AS ( We focus on target repositories that have published releases by querying the `oso_production.int_events__github` table for `RELEASE_PUBLISHED` events. ```sql -repos_with_releases AS ( - SELECT DISTINCT to_artifact_id - FROM `oso_production.int_events__github` - WHERE event_type = 'RELEASE_PUBLISHED' +repos_with_releases as ( + select distinct to_artifact_id + from `oso_production.timeseries_events_by_artifact_v0` + where event_type = 'RELEASE_PUBLISHED' ) ``` @@ -97,31 +99,31 @@ Finally, we track interactions of core developers with other repositories. We jo - Target project interactions (days, amount, types) ```sql -dev_other_repos AS ( - SELECT +dev_other_repos as ( + select cd.developer_id, cd.developer_name, rr.project_name as source_project, rp.gas_fees as source_project_gas_fees, rp.txns as source_project_txns, rp.users as source_project_users, - e.to_artifact_namespace, - e.to_artifact_name, + target_rm.artifact_namespace, + target_rm.artifact_name, target_p.project_name as target_project_name, - COUNT(DISTINCT date_trunc(e.time, DAY)) as target_project_interaction_days_from_dev, - SUM(e.amount) as target_project_interaction_amount_from_dev, - COUNT(DISTINCT e.event_type) as target_project_interaction_types_distinct - FROM core_devs cd - JOIN relevant_repos rr ON cd.repo_id = rr.artifact_id - JOIN relevant_projects rp ON rr.project_id = rp.project_id - JOIN `oso_production.int_events__github` e ON cd.developer_id = e.from_artifact_id - LEFT JOIN `oso_production.int_repo_metrics_by_project` target_rm ON e.to_artifact_id = target_rm.artifact_id - LEFT JOIN `oso_production.projects_v1` target_p ON target_rm.project_id = target_p.project_id - WHERE e.to_artifact_id NOT IN (SELECT artifact_id FROM relevant_repos) - AND e.to_artifact_id IN (SELECT to_artifact_id FROM repos_with_releases) - AND e.time >= '2023-01-01' - GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9 - HAVING target_project_interaction_days_from_dev >= 1 + count(distinct date_trunc(e.time, day)) as target_project_interaction_days_from_dev, + sum(e.amount) as target_project_interaction_amount_from_dev, + count(distinct e.event_type) as target_project_interaction_types_distinct + from core_devs cd + join relevant_repos rr on cd.repo_id = rr.artifact_id + join relevant_projects rp on rr.project_id = rp.project_id + join `oso_production.timeseries_events_by_artifact_v0` e on cd.developer_id = e.from_artifact_id + left join `oso_production.repositories_v0` target_rm on e.to_artifact_id = target_rm.artifact_id + left join `oso_production.projects_v1` target_p on target_rm.project_id = target_p.project_id + where e.to_artifact_id not in (select artifact_id from relevant_repos) + and e.to_artifact_id in (select to_artifact_id from repos_with_releases) + and e.time >= '2023-01-01' + group by 1,2,3,4,5,6,7,8,9 + having count(distinct date_trunc(e.time, day)) >= 1 ) ``` @@ -130,9 +132,9 @@ dev_other_repos AS ( The final selection filters out interactions where the source and target projects are the same and orders the results by interaction days. ```sql -SELECT * FROM dev_other_repos -WHERE source_project != target_project_name -ORDER BY target_project_interaction_days_from_dev DESC +select * from dev_other_repos +where source_project != target_project_name +order by target_project_interaction_days_from_dev desc ``` ### Run the full query @@ -141,88 +143,86 @@ Here's the full query for the network graph: ```sql -- Step 1: Identify relevant projects with sufficient activity -WITH relevant_projects AS ( - SELECT - project_id, - project_name, - SUM(gas_fees_sum_6_months) AS gas_fees, - SUM(transaction_count_6_months) AS txns, - SUM(address_count_90_days) AS users - FROM `oso_production.onchain_metrics_by_project_v1` - WHERE event_source IN ('OPTIMISM', 'BASE', 'MODE') - GROUP BY project_id, project_name - HAVING txns > 1000 AND users > 420 +with relevant_projects as ( + select + project_id, + project_name, + sum(gas_fees_sum_6_months) as gas_fees, + sum(transaction_count_6_months) as txns, + sum(address_count_90_days) as users + from `oso_production.onchain_metrics_by_project_v1` + where event_source in ('OPTIMISM', 'BASE', 'MODE') + group by project_id, project_name + having txns > 1000 and users > 420 ), -- Step 2: Fetch repositories related to the relevant projects -relevant_repos AS ( - SELECT - rm.artifact_id, - p.project_name, - p.project_id - FROM `oso_production.int_repo_metrics_by_project` rm - JOIN relevant_projects p ON rm.project_id = p.project_id - WHERE rm.language IN ('TypeScript', 'Solidity', 'Rust') +relevant_repos as ( + select + rm.artifact_id, + p.project_name, + p.project_id + from `oso_production.repositories_v0` rm + join relevant_projects p on rm.project_id = p.project_id + where rm.language in ('TypeScript', 'Solidity', 'Rust') ), -- Step 3: Identify core developers with significant contributions -core_devs AS ( - SELECT DISTINCT - from_artifact_id AS developer_id, - from_artifact_name AS developer_name, - to_artifact_id AS repo_id - FROM `oso_production.int_events__github` - WHERE to_artifact_id IN (SELECT artifact_id FROM relevant_repos) - AND event_type = 'COMMIT_CODE' - AND from_artifact_name NOT LIKE '%[bot]%' - GROUP BY developer_id, developer_name, repo_id - HAVING COUNT(DISTINCT DATE_TRUNC(time, MONTH)) >= 3 - AND SUM(amount) >= 20 +core_devs as ( + select + e.from_artifact_id as developer_id, + u.display_name as developer_name, + e.to_artifact_id as repo_id + from `oso_production.timeseries_events_by_artifact_v0` e + join `oso_production.users_v1` u + on e.from_artifact_id = u.user_id + where e.to_artifact_id in (select artifact_id from relevant_repos) + and e.event_type = 'COMMIT_CODE' + and u.display_name not like '%[bot]%' + group by 1,2,3 + having count(distinct date_trunc(time, month)) >= 3 + and sum(amount) >= 20 ), -- Step 4: Identify repositories that published releases -repos_with_releases AS ( - SELECT DISTINCT to_artifact_id - FROM `oso_production.int_events__github` - WHERE event_type = 'RELEASE_PUBLISHED' +repos_with_releases as ( + select distinct to_artifact_id + from `oso_production.timeseries_events_by_artifact_v0` + where event_type = 'RELEASE_PUBLISHED' ), -- Step 5: Fetch interactions of core developers in other repositories -dev_other_repos AS ( - SELECT - cd.developer_id, - cd.developer_name, - rr.project_name AS source_project, - rp.gas_fees AS source_project_gas_fees, - rp.txns AS source_project_txns, - rp.users AS source_project_users, - e.to_artifact_namespace, - e.to_artifact_name, - target_p.project_name AS target_project_name, - COUNT(DISTINCT DATE_TRUNC(e.time, DAY)) AS target_project_interaction_days_from_dev, - SUM(e.amount) AS target_project_interaction_amount_from_dev, - COUNT(DISTINCT e.event_type) AS target_project_interaction_types_distinct - FROM core_devs cd - JOIN relevant_repos rr ON cd.repo_id = rr.artifact_id - JOIN relevant_projects rp ON rr.project_id = rp.project_id - JOIN `oso_production.int_events__github` e ON cd.developer_id = e.from_artifact_id - LEFT JOIN `oso_production.int_repo_metrics_by_project` target_rm ON e.to_artifact_id = target_rm.artifact_id - LEFT JOIN `oso_production.projects_v1` target_p ON target_rm.project_id = target_p.project_id - WHERE e.to_artifact_id NOT IN (SELECT artifact_id FROM relevant_repos) - AND e.to_artifact_id IN (SELECT to_artifact_id FROM repos_with_releases) - AND e.time >= '2023-01-01' - GROUP BY - cd.developer_id, cd.developer_name, - rr.project_name, rp.gas_fees, rp.txns, rp.users, - e.to_artifact_namespace, e.to_artifact_name, target_project_name - HAVING COUNT(DISTINCT DATE_TRUNC(e.time, DAY)) >= 1 +dev_other_repos as ( + select + cd.developer_id, + cd.developer_name, + rr.project_name as source_project, + rp.gas_fees as source_project_gas_fees, + rp.txns as source_project_txns, + rp.users as source_project_users, + target_rm.artifact_namespace, + target_rm.artifact_name, + target_p.project_name as target_project_name, + count(distinct date_trunc(e.time, day)) as target_project_interaction_days_from_dev, + sum(e.amount) as target_project_interaction_amount_from_dev, + count(distinct e.event_type) as target_project_interaction_types_distinct + from core_devs cd + join relevant_repos rr on cd.repo_id = rr.artifact_id + join relevant_projects rp on rr.project_id = rp.project_id + join `oso_production.timeseries_events_by_artifact_v0` e on cd.developer_id = e.from_artifact_id + left join `oso_production.repositories_v0` target_rm on e.to_artifact_id = target_rm.artifact_id + left join `oso_production.projects_v1` target_p on target_rm.project_id = target_p.project_id + where e.to_artifact_id not in (select artifact_id from relevant_repos) + and e.to_artifact_id in (select to_artifact_id from repos_with_releases) + and e.time >= '2023-01-01' + group by 1,2,3,4,5,6,7,8,9 + having count(distinct date_trunc(e.time, day)) >= 1 ) -- Step 6: Select final results -SELECT * -FROM dev_other_repos -WHERE source_project != target_project_name -ORDER BY target_project_interaction_days_from_dev DESC +select * from dev_other_repos +where source_project != target_project_name +order by target_project_interaction_days_from_dev desc ``` ## Python @@ -251,88 +251,87 @@ To run this query in Python, you can use the following: ```python query = """ -- Step 1: Identify relevant projects with sufficient activity -WITH relevant_projects AS ( - SELECT - project_id, - project_name, - SUM(gas_fees_sum_6_months) AS gas_fees, - SUM(transaction_count_6_months) AS txns, - SUM(address_count_90_days) AS users - FROM `oso_production.onchain_metrics_by_project_v1` - WHERE event_source IN ('OPTIMISM', 'BASE', 'MODE') - GROUP BY project_id, project_name - HAVING txns > 1000 AND users > 420 +with relevant_projects as ( + select + project_id, + project_name, + sum(gas_fees_sum_6_months) as gas_fees, + sum(transaction_count_6_months) as txns, + sum(address_count_90_days) as users + from `oso_production.onchain_metrics_by_project_v1` + where event_source in ('OPTIMISM', 'BASE', 'MODE') + group by project_id, project_name + having txns > 1000 and users > 420 ), -- Step 2: Fetch repositories related to the relevant projects -relevant_repos AS ( - SELECT - rm.artifact_id, - p.project_name, - p.project_id - FROM `oso_production.int_repo_metrics_by_project` rm - JOIN relevant_projects p ON rm.project_id = p.project_id - WHERE rm.language IN ('TypeScript', 'Solidity', 'Rust') +relevant_repos as ( + select + rm.artifact_id, + p.project_name, + p.project_id + from `oso_production.repositories_v0` rm + join relevant_projects p on rm.project_id = p.project_id + where rm.language in ('TypeScript', 'Solidity', 'Rust') ), -- Step 3: Identify core developers with significant contributions -core_devs AS ( - SELECT DISTINCT - from_artifact_id AS developer_id, - from_artifact_name AS developer_name, - to_artifact_id AS repo_id - FROM `oso_production.int_events__github` - WHERE to_artifact_id IN (SELECT artifact_id FROM relevant_repos) - AND event_type = 'COMMIT_CODE' - AND from_artifact_name NOT LIKE '%[bot]%' - GROUP BY developer_id, developer_name, repo_id - HAVING COUNT(DISTINCT DATE_TRUNC(time, MONTH)) >= 3 - AND SUM(amount) >= 20 +core_devs as ( + select + e.from_artifact_id as developer_id, + u.display_name as developer_name, + e.to_artifact_id as repo_id + from `oso_production.timeseries_events_by_artifact_v0` e + join `oso_production.users_v1` u + on e.from_artifact_id = u.user_id + where e.to_artifact_id in (select artifact_id from relevant_repos) + and e.event_type = 'COMMIT_CODE' + and u.display_name not like '%[bot]%' + group by 1,2,3 + having count(distinct date_trunc(time, month)) >= 3 + and sum(amount) >= 20 ), -- Step 4: Identify repositories that published releases -repos_with_releases AS ( - SELECT DISTINCT to_artifact_id - FROM `oso_production.int_events__github` - WHERE event_type = 'RELEASE_PUBLISHED' +repos_with_releases as ( + select distinct to_artifact_id + from `oso_production.timeseries_events_by_artifact_v0` + where event_type = 'RELEASE_PUBLISHED' ), -- Step 5: Fetch interactions of core developers in other repositories -dev_other_repos AS ( - SELECT - cd.developer_id, - cd.developer_name, - rr.project_name AS source_project, - rp.gas_fees AS source_project_gas_fees, - rp.txns AS source_project_txns, - rp.users AS source_project_users, - e.to_artifact_namespace, - e.to_artifact_name, - target_p.project_name AS target_project_name, - COUNT(DISTINCT DATE_TRUNC(e.time, DAY)) AS target_project_interaction_days_from_dev, - SUM(e.amount) AS target_project_interaction_amount_from_dev, - COUNT(DISTINCT e.event_type) AS target_project_interaction_types_distinct - FROM core_devs cd - JOIN relevant_repos rr ON cd.repo_id = rr.artifact_id - JOIN relevant_projects rp ON rr.project_id = rp.project_id - JOIN `oso_production.int_events__github` e ON cd.developer_id = e.from_artifact_id - LEFT JOIN `oso_production.int_repo_metrics_by_project` target_rm ON e.to_artifact_id = target_rm.artifact_id - LEFT JOIN `oso_production.projects_v1` target_p ON target_rm.project_id = target_p.project_id - WHERE e.to_artifact_id NOT IN (SELECT artifact_id FROM relevant_repos) - AND e.to_artifact_id IN (SELECT to_artifact_id FROM repos_with_releases) - AND e.time >= '2023-01-01' - GROUP BY - cd.developer_id, cd.developer_name, - rr.project_name, rp.gas_fees, rp.txns, rp.users, - e.to_artifact_namespace, e.to_artifact_name, target_project_name - HAVING COUNT(DISTINCT DATE_TRUNC(e.time, DAY)) >= 1 +dev_other_repos as ( + select + cd.developer_id, + cd.developer_name, + rr.project_name as source_project, + rp.gas_fees as source_project_gas_fees, + rp.txns as source_project_txns, + rp.users as source_project_users, + target_rm.artifact_namespace, + target_rm.artifact_name, + target_p.project_name as target_project_name, + count(distinct date_trunc(e.time, day)) as target_project_interaction_days_from_dev, + sum(e.amount) as target_project_interaction_amount_from_dev, + count(distinct e.event_type) as target_project_interaction_types_distinct + from core_devs cd + join relevant_repos rr on cd.repo_id = rr.artifact_id + join relevant_projects rp on rr.project_id = rp.project_id + join `oso_production.timeseries_events_by_artifact_v0` e on cd.developer_id = e.from_artifact_id + left join `oso_production.repositories_v0` target_rm on e.to_artifact_id = target_rm.artifact_id + left join `oso_production.projects_v1` target_p on target_rm.project_id = target_p.project_id + where e.to_artifact_id not in (select artifact_id from relevant_repos) + and e.to_artifact_id in (select to_artifact_id from repos_with_releases) + and e.time >= '2023-01-01' + group by 1,2,3,4,5,6,7,8,9 + having count(distinct date_trunc(e.time, day)) >= 1 ) -- Step 6: Select final results -SELECT * -FROM dev_other_repos -WHERE source_project != target_project_name -ORDER BY target_project_interaction_days_from_dev DESC +select * +from dev_other_repos +where source_project != target_project_name +order by target_project_interaction_days_from_dev desc """ diff --git a/warehouse/dbt/models/intermediate/users/int_users.sql b/warehouse/dbt/models/intermediate/users/int_users.sql index 2c5354fc1..a3356309d 100644 --- a/warehouse/dbt/models/intermediate/users/int_users.sql +++ b/warehouse/dbt/models/intermediate/users/int_users.sql @@ -20,8 +20,29 @@ lens_users as ( bio, "" as url from {{ ref('stg_lens__profiles') }} +), + +github_users as ( + select + from_artifact_id as user_id, + from_artifact_source_id as user_source_id, + "GITHUB" as user_source, + display_name, + "" as profile_picture_url, + "" as bio, + "https://github.com/" || display_name as url + from ( + select + from_artifact_id, + from_artifact_source_id, + MAX_BY(LOWER(from_artifact_name), time) as display_name + from {{ ref('int_events__github') }} + group by from_artifact_id, from_artifact_source_id + ) ) select * from farcaster_users union all select * from lens_users +union all +select * from github_users