
I'm learning Airflow and AWS. I have a simple DAG that extracts tweets from a specific location, loads a CSV into an S3 bucket, and then copies that CSV into a Redshift table. The Airflow UI marks all tasks as completed with no errors, and the CSV is in my S3 bucket, but I can't see the data in the Redshift query editor or in DBeaver.

Here's my DAG file:

from datetime import datetime, timedelta
from airflow import DAG
from airflow.providers.postgres.operators.postgres import PostgresOperator
from airflow.operators.python_operator import PythonOperator
import tweepy as tw
import pandas as pd
import boto3
import json
from sqlalchemy import create_engine
import psycopg2



default_args = {
    'owner':'emas',
    'retries':5,
    'retry_delay':timedelta(minutes=5)    
}

def jsonify_tweepy(tweepy_object):
    json_str = json.dumps(tweepy_object._json)
    return json.loads(json_str)

def run_tw_etl():
    df2 = pd.DataFrame()
    auth = tw.OAuthHandler('asdas', 'asdas')
    auth.set_access_token('asdas-asdas', 'asdas')
    api = tw.API(auth, wait_on_rate_limit=True)
    tweets = tw.Cursor(api.search_tweets, q="-filter:retweets", tweet_mode='extended', geocode="-34.610226,-58.444815,10km").items(1)

    for tweet in tweets:
        aux = jsonify_tweepy(tweet)
        df1 = pd.json_normalize(aux)[['created_at', 'id', 'user.id', 'full_text', 'favorite_count', 'retweet_count']]
        df1['created_at'] = pd.to_datetime(df1['created_at']).dt.to_period("d")
        df1['id'] = df1['id'].astype(str)
        df1['user.id'] = df1['user.id'].astype(str)
        df1['full_text'] = df1['full_text'].astype(str)
        df1['favorite_count'] = df1['favorite_count'].astype(str)
        df1['retweet_count'] = df1['retweet_count'].astype(str)
        df1.rename(columns={'id':'tweet_id', 'user.id':'user_id', 'full_text':'tweet'}, inplace=True)
        
        df2 = df2.append(df1)

    df2.to_csv("/opt/airflow/dags/tweets.csv", index=False)


def upload_to_s3(file_name, s3_bucket, s3_key):
    s3 = boto3.client("s3", aws_access_key_id="asdas", aws_secret_access_key="asdas+asdas")
    s3.upload_file(
        file_name,
        s3_bucket,
        s3_key
    )

def copy_to_rs():
    conn_string = "dbname='{}' port='{}' user='{}' password='{}' host='{}'"\
        .format('dev','5439','esteban_m', 'asdas', 'asdas')
    print(conn_string)
    con = psycopg2.connect(conn_string)
    cur = con.cursor() 
    sqlQuery='''copy public.tweet
    from 's3://mi-bucket-esteban/tweets_06_15.csv'
    iam_role 'arn:aws:iam::asdas:role/service-role/AmazonRedshift-CommandsAccessRole-asdas'
    delimiter ','
    IGNOREHEADER as 1
    csv;'''
    print(sqlQuery)
    cur.execute(sqlQuery)
    


with DAG(
    default_args=default_args,
    dag_id='dag_twitter_v6',
    start_date=datetime(2023, 6, 15),
    schedule_interval='0 0 * * *'
) as dag:
   
    task1 = PythonOperator(
        task_id = "tw_etl",
        python_callable=run_tw_etl
    )

    task2 = PythonOperator(
        task_id = "upload_to_s3",
        python_callable= upload_to_s3,
        op_args = ['/opt/airflow/dags/tweets.csv', 'mi-bucket-esteban', "tweets_06_15.csv"]
    )

    task3 = PythonOperator(
        task_id = "copy_to_rs",
        python_callable = copy_to_rs,

    )


    task1 >> task2 >> task3
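
For reference, since PostgresOperator is already imported (but unused), task3 could also be sketched as a plain SQL task that runs the COPY through an Airflow connection and commits it via autocommit. This is only a sketch under that assumption; the postgres_conn_id value 'redshift_default' is a placeholder for whatever Redshift connection is configured in Airflow:

    # Alternative sketch for task3 using the already-imported PostgresOperator.
    # 'redshift_default' is a placeholder Airflow connection id pointing at the
    # Redshift cluster; autocommit=True commits the COPY once it finishes.
    task3 = PostgresOperator(
        task_id="copy_to_rs",
        postgres_conn_id="redshift_default",
        autocommit=True,
        sql="""
            copy public.tweet
            from 's3://mi-bucket-esteban/tweets_06_15.csv'
            iam_role 'arn:aws:iam::asdas:role/service-role/AmazonRedshift-CommandsAccessRole-asdas'
            delimiter ','
            IGNOREHEADER as 1
            csv;
        """
    )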
  • I think you can use this to get some help https://stackoverflow.com/questions/15601704/copying-data-from-s3-to-aws-redshift-using-python-and-psycopg2 – Code run Jun 20 '23 at 19:41
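
If the linked thread applies here, the usual culprit is that psycopg2 does not autocommit: copy_to_rs executes the COPY inside an open transaction and never calls con.commit(), so the load is rolled back when the connection is closed. A minimal sketch of copy_to_rs with an explicit commit (host and credentials are placeholders, as in the question; psycopg2 is already imported at the top of the DAG file):

def copy_to_rs():
    conn_string = "dbname='{}' port='{}' user='{}' password='{}' host='{}'"\
        .format('dev', '5439', 'esteban_m', 'asdas', 'asdas')
    con = psycopg2.connect(conn_string)
    try:
        cur = con.cursor()
        cur.execute('''copy public.tweet
        from 's3://mi-bucket-esteban/tweets_06_15.csv'
        iam_role 'arn:aws:iam::asdas:role/service-role/AmazonRedshift-CommandsAccessRole-asdas'
        delimiter ','
        IGNOREHEADER as 1
        csv;''')
        con.commit()  # without this, the COPY is rolled back and never becomes visible
    finally:
        con.close()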

0 Answers