-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcomm_extraction.py
140 lines (113 loc) · 4.44 KB
/
comm_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import sys
import json
from os import path
from datetime import datetime, timedelta
import logging
import time
from airflow import DAG
from airflow.decorators import dag, task, task_group
from airflow.operators.python import get_current_context
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
from airflow.providers.http.operators.http import SimpleHttpOperator
# HACK: Fix for loading relative modules.
sys.path.append(path.dirname(path.realpath(__file__)))
from tasks.airbyte import fetch_airbyte_connections_tg
from providers.airbyte.operator import AirbyteTriggerSyncOperator
from tasks.alerting import send_alert_discord
from tasks.config import INTERNAL_ENV
"""
DAG to sync communication data (Discord, Simplecast and Twitter) used across the org
"""
# Configure root logging to write to stdout at INFO level.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Default arguments passed to the DAG through `default_args`.
# NOTE(review): `catchup` is normally a DAG-level argument in Airflow; placed
# here it is probably ignored -- confirm and set it on the DAG if intended.
ARGS = dict(
    owner="apentori",
    depends_on_past=False,
    start_date=datetime(2024, 2, 20),
    email=["alexis@status.im"],
    email_on_failure=False,
    email_on_retry=False,
    retries=0,
    retry_delay=timedelta(minutes=10),
    catchup=False,
    # Callback imported from tasks.alerting, invoked on task failure.
    on_failure_callback=send_alert_discord,
)
# Names of the Airbyte connections this DAG uses; the list is passed to
# fetch_airbyte_connections_tg in the DAG body and the result is indexed by
# these same names.
airbyte_connections = [
    "discord_fetcher",
    "simplecast_fetch",
    # One connection per Twitter account, all sharing the "twitter-" prefix.
    *(
        f"twitter-{account}"
        for account in (
            "nomos",
            "codex",
            "logos",
            "status",
            "nimbus",
            "waku",
            "vac",
            "keycard",
            "ift",
            "operator",
        )
    ),
]
@task(task_id="wait_for_api")
def wait_for_api():
    # The Twitter API limits the number of calls per 15-minute window, so this
    # task simply blocks for one full window.
    # https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/api-reference/get-tweets#tab1
    rate_limit_window_seconds = 15 * 60
    time.sleep(rate_limit_window_seconds)
# Task group that pairs one Twitter Airbyte sync with a waiting task, so that
# whatever is chained after the group starts only once the wait has elapsed
# (keeps successive syncs under the API rate limit).
#   connection_id:  Airbyte connection ID to sync.
#   airb_conn_name: short account name, used as suffix of the sync task id.
@task_group(group_id='fetch_twitter_info')
def fetch_twitter_info(connection_id, airb_conn_name):
    sync_task_id = 'airbyte_fetch_twittter_' + airb_conn_name
    sync_op = AirbyteTriggerSyncOperator(
        task_id=sync_task_id,
        connection_id=connection_id,
        airbyte_conn_id='airbyte_conn',
        wait_seconds=3,
        asynchronous=False,
        # "all_done": do not require the upstream tasks to have succeeded.
        trigger_rule="all_done",
    )
    cooldown = wait_for_api()
    sync_op >> cooldown
# DAG wiring: resolve the Airbyte connection IDs, sync Discord and Simplecast,
# then sync every Twitter account one after another, and finally run the dbt
# models for Discord and Twitter.
@dag(
    'comm_extraction',
    default_args=ARGS,
    schedule_interval='0 */24 * * * ',
    # `catchup` is a DAG-level argument: the value in `default_args` is not
    # applied to the DAG, so it is set explicitly here to avoid backfilling
    # every interval since `start_date`.
    catchup=False,
)
def comm_extraction():
    # Mapping-like result indexed below by Airbyte connection name.
    connections_id = fetch_airbyte_connections_tg(airbyte_connections)

    # Trigger the Airbyte syncs for Discord and Simplecast.
    discord_fetcher = AirbyteTriggerSyncOperator(
        task_id='airbyte_fetch_discord',
        airbyte_conn_id='airbyte_conn',
        connection_id=connections_id['discord_fetcher'],
        asynchronous=False,
        wait_seconds=3
    )
    simplecast_fetch = AirbyteTriggerSyncOperator(
        task_id='airbyte_fetch_simplecast',
        airbyte_conn_id='airbyte_conn',
        connection_id=connections_id['simplecast_fetch'],
        asynchronous=False,
        wait_seconds=3
    )

    # One task group (sync + rate-limit wait) per Twitter account.
    #twitter_acid_info = fetch_twitter_info(connections_id['twitter-acid'], 'acid_info')
    twitter_nomos_tech = fetch_twitter_info(connections_id['twitter-nomos'], 'nomos')
    twitter_codex = fetch_twitter_info(connections_id['twitter-codex'], 'codex')
    twitter_logos = fetch_twitter_info(connections_id['twitter-logos'], 'logos')
    twitter_status = fetch_twitter_info(connections_id['twitter-status'], 'status')
    twitter_nimbus = fetch_twitter_info(connections_id['twitter-nimbus'], 'nimbus')
    twitter_waku = fetch_twitter_info(connections_id['twitter-waku'], 'waku')
    twitter_vac = fetch_twitter_info(connections_id['twitter-vac'], 'vac')
    twitter_keycard = fetch_twitter_info(connections_id['twitter-keycard'], 'keycard')
    twitter_ift = fetch_twitter_info(connections_id['twitter-ift'], 'ift')
    twitter_operator = fetch_twitter_info(connections_id['twitter-operator'], 'operator')

    dbt_run_discord = BashOperator(
        task_id='dbt_run_models_discord',
        bash_command='dbt run --select discord',
        env=INTERNAL_ENV,
        append_env=True
    )
    dbt_run_twitter = BashOperator(
        task_id='dbt_run_models_twitter',
        bash_command='dbt run --select twitter',
        env=INTERNAL_ENV,
        append_env=True
    )

    # Twitter connections have to be sequentially run to avoid API Rate Limits.
    # Every Twitter group must appear in this chain: a group left out (as
    # `twitter_status` previously was) would run concurrently with the others.
    (
        connections_id
        >> [discord_fetcher, simplecast_fetch]
        >> twitter_nomos_tech
        >> twitter_codex
        >> twitter_logos
        >> twitter_status
        >> twitter_waku
        >> twitter_nimbus
        >> twitter_vac
        >> twitter_keycard
        >> twitter_operator
        >> twitter_ift
        >> dbt_run_discord
        >> dbt_run_twitter
    )


# Instantiate the DAG at import time so the Airflow scheduler registers it.
comm_extraction()