I'm learning Airflow and AWS. I have a simple DAG that extracts tweets from a specific location, loads a CSV into an S3 bucket, and then copies that CSV into a Redshift table. The Airflow UI marks all tasks as completed with no errors, and the CSV is in my S3 bucket, but I can't see the data in the Redshift query editor or in DBeaver.
Here's my DAG file:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.operators.python_operator import PythonOperator
import tweepy as tw
import pandas as pd
import boto3
import json
from sqlalchemy import create_engine
import psycopg2
default_args = {
    'owner': 'emas',
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}
def jsonify_tweepy(tweepy_object):
    json_str = json.dumps(tweepy_object._json)
    return json.loads(json_str)
def run_tw_etl():
    df2 = pd.DataFrame()
    auth = tw.OAuthHandler('asdas', 'asdas')
    auth.set_access_token('asdas-asdas', 'asdas')
    api = tw.API(auth, wait_on_rate_limit=True)
    tweets = tw.Cursor(api.search_tweets, q="-filter:retweets", tweet_mode='extended', geocode="-34.610226,-58.444815,10km").items(1)
    for tweet in tweets:
        aux = jsonify_tweepy(tweet)
        df1 = pd.json_normalize(aux)[['created_at', 'id', 'user.id', 'full_text', 'favorite_count', 'retweet_count']]
        df1['created_at'] = pd.to_datetime(df1['created_at']).dt.to_period("d")
        df1['id'] = df1['id'].astype(str)
        df1['user.id'] = df1['user.id'].astype(str)
        df1['full_text'] = df1['full_text'].astype(str)
        df1['favorite_count'] = df1['favorite_count'].astype(str)
        df1['retweet_count'] = df1['retweet_count'].astype(str)
        df1.rename(columns={'id': 'tweet_id', 'user.id': 'user_id', 'full_text': 'tweet'}, inplace=True)
        df2 = df2.append(df1)
    df2.to_csv("/opt/airflow/dags/tweets.csv", index=False)
def upload_to_s3(file_name, s3_bucket, s3_key):
    s3 = boto3.client("s3", aws_access_key_id="asdas", aws_secret_access_key="asdas+asdas")
    s3.upload_file(
        file_name,
        s3_bucket,
        s3_key
    )
def copy_to_rs():
    conn_string = "dbname='{}' port='{}' user='{}' password='{}' host='{}'"\
        .format('dev', '5439', 'esteban_m', 'asdas', 'asdas')
    print(conn_string)
    con = psycopg2.connect(conn_string)
    cur = con.cursor()
    sqlQuery = '''copy public.tweet
        from 's3://mi-bucket-esteban/tweets_06_15.csv'
        iam_role 'arn:aws:iam::asdas:role/service-role/AmazonRedshift-CommandsAccessRole-asdas'
        delimiter ','
        IGNOREHEADER as 1
        csv;'''
    print(sqlQuery)
    cur.execute(sqlQuery)
with DAG(
    default_args=default_args,
    dag_id='dag_twitter_v6',
    start_date=datetime(2023, 6, 15),
    schedule_interval='0 0 * * *'
) as dag:
    task1 = PythonOperator(
        task_id="tw_etl",
        python_callable=run_tw_etl
    )
    task2 = PythonOperator(
        task_id="upload_to_s3",
        python_callable=upload_to_s3,
        op_args=['/opt/airflow/dags/tweets.csv', 'mi-bucket-esteban', "tweets_06_15.csv"]
    )
    task3 = PythonOperator(
        task_id="copy_to_rs",
        python_callable=copy_to_rs,
    )

    task1 >> task2 >> task3
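
For reference, this is roughly how I check for the data in the Redshift query editor and in DBeaver (just a plain select on the table the COPY targets; the exact query is only an example), and it comes back with no rows:

select * from public.tweet limit 10;
select count(*) from public.tweet;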