
I am new to Airflow. I have a requirement to write a DAG to which I need to pass a SQL file. The SQL file consists of many queries and uses BigQuery tables. It should be scheduled to run once a day around 3 AM PST. Which operators do I need to use for this DAG? Also, in the DML there is a variable called current_event_date that is set to get the previous day's data, but it should be parameterized because there is a need to run this process for an earlier day.

 DECLARE current_event_date STRING DEFAULT CONCAT("'", CAST(DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY) AS STRING), "'");

Would be great if someone can give me more insight on how to write a DAG that is scheduled at this time, and how I can parameterize the current_event_date variable. Do I need to parameterize it in the DAG file or in the SQL file?

My SQL file contains these queries:

DECLARE idx, col_cnt, row_cnt, idx_row INT64;
DECLARE col_name, col_flag STRING;
DECLARE cmp_cond,lookup_query, lookup_query_row STRING;
DECLARE col_list ARRAY <STRING>;
DECLARE is_required BOOLEAN;
DECLARE event_names_len, valid_values_len INT64;
DECLARE logic_based_fields STRING; -- this variable is used to hard-code the rules that are not in the lookup table
DECLARE current_event_date STRING DEFAULT CONCAT("'",CAST(DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY) AS STRING),"'");

-- Re-create temp table to get invalid flags fields from base and lookup tables
CREATE OR REPLACE TABLE `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp` AS
   SELECT
         base.column_name,
         base.column_flag,
         base.required_field_flag,
         base.event_names,
         base.valid_values,
         base.field_name,
         base.__row_number,
         DENSE_RANK() OVER(PARTITION BY base.column_name ORDER BY base.__row_number) AS field_rank
      FROM
      (
         SELECT 
            bc.column_name,
            vlk.field_name,
            bc.column_flag,
            vlk.required_field_flag,
            vlk.event_names,
            vlk.valid_values,
            ROW_NUMBER() OVER() AS __row_number
         FROM 
            (SELECT
               column_name as column_flag,
               SUBSTR (column_name, 1, INSTR(column_name, 'is_invalid')-2) column_name
            FROM
               `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.INFORMATION_SCHEMA.COLUMNS`
            WHERE
            table_name = 'st_vix_ott_dev_dq_monitoring_base_test'
            AND column_name LIKE "%is_invalid%"
            ) bc
            INNER JOIN `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_valid_values_lookup_test` vlk
            ON bc.column_name = vlk.field_name
            ORDER BY 1
         ) base
      ORDER BY base.__row_number;

--SELECT * FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`;

-- Set control variables
SET col_cnt = (SELECT COUNT (*) 
               FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`);
SET idx = 1;
SET lookup_query = '';

--build case statements dynamically based on valid values on the lookup table
WHILE idx <= col_cnt DO
   SET (col_flag,col_name) = (
     SELECT AS STRUCT column_flag,column_name 
     FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
     WHERE __row_number = idx
     );

   SET row_cnt = (SELECT count(*) 
                  FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
                  WHERE IFNULL(field_name,'') = col_name );

   IF row_cnt = 1 THEN
         --Check if event_names & valid_values arrays are not empty
      SET (event_names_len, valid_values_len, is_required) = (
            SELECT AS STRUCT ARRAY_LENGTH(event_names),ARRAY_LENGTH(valid_values), required_field_flag 
            FROM  `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
            WHERE field_name = col_name
         );

      --Check event_name + field_name is required + field name has invalid values
      IF (is_required AND event_names_len > 0 AND valid_values_len > 0) THEN
            SET cmp_cond = (
               SELECT CONCAT (",CASE WHEN REGEXP_CONTAINS(LOWER(event_name), '^(",LOWER(ARRAY_TO_STRING(event_names,"|")),
               ")') AND (IFNULL(",col_name,",'') = '' OR LOWER(CAST(", LOWER(col_name), " AS STRING)) NOT IN ('", LOWER(ARRAY_TO_STRING (valid_values, "', '")), 
               "')) THEN true ELSE false END AS ",col_flag)
               FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
               WHERE field_name = col_name
            );
            SET lookup_query = CONCAT(lookup_query, cmp_cond);

         --Check event_name + field_name is required
         ELSEIF (is_required AND event_names_len > 0 AND valid_values_len = 0) THEN
            SET cmp_cond = (
               SELECT CONCAT (",CASE WHEN REGEXP_CONTAINS(LOWER(event_name), '^(",LOWER(ARRAY_TO_STRING(event_names,"|")),")') AND IFNULL("
               ,col_name,",'') = '' THEN true ELSE false END AS ",col_flag)
               FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
               WHERE field_name = col_name
            );
            SET lookup_query = CONCAT(lookup_query, cmp_cond);

         --Check field_name is required      
         ELSEIF (is_required AND event_names_len = 0 AND valid_values_len = 0) THEN
            SET cmp_cond = (
                  SELECT CONCAT (",CASE WHEN IFNULL(", col_name, ",'') = '' THEN true ELSE false END AS ",col_flag)
                  FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
                  WHERE field_name = col_name
            );
            SET lookup_query = CONCAT(lookup_query, cmp_cond);
      END IF;
   
   -- field_name with multiple rows
   ELSEIF row_cnt > 1 THEN
      SET idx_row = 1;
      SET lookup_query_row = '';
         
      WHILE idx_row <= row_cnt DO
         --Check if event_names & valid_values arrays are not empty
         SET (event_names_len, valid_values_len, is_required) = (
               SELECT AS STRUCT ARRAY_LENGTH(event_names),ARRAY_LENGTH(valid_values), required_field_flag 
               FROM  `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
               WHERE column_name = col_name and field_rank = idx_row
            );

         --Check event_name + field_name is required + field name has invalid values
         IF (is_required AND event_names_len > 0 AND valid_values_len > 0) THEN
            SET cmp_cond = (
               SELECT CONCAT ("WHEN REGEXP_CONTAINS(LOWER(event_name), '^(",LOWER(ARRAY_TO_STRING(event_names,"|")),
               ")') AND (IFNULL(",col_name,",'') = '' OR LOWER(CAST(", LOWER(col_name), " AS STRING)) NOT IN ('", LOWER(ARRAY_TO_STRING (valid_values, "', '")), 
               "')) THEN true ")
               FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`
               WHERE column_name = col_name and field_rank = idx_row
            );
            SET lookup_query_row = CONCAT(lookup_query_row, cmp_cond);
         END IF;
         SET idx_row = idx_row + 1;
      END WHILE;
      SET lookup_query = CONCAT(lookup_query,",CASE ", lookup_query_row, "ELSE false END AS ", col_flag);
      SET idx = idx + row_cnt - 1; -- increment to go to the next field
   
   ELSE
      SET cmp_cond = CONCAT(",NULL AS ", col_flag);
      SET lookup_query = CONCAT(lookup_query, cmp_cond); 
   END IF;
   SET idx = idx + 1; --counter main while loop  
END WHILE;

-- This is a workaround due to BQ's dynamic SQL limitations with nested CASE statements
-- These fields aren't in the valid values lookup table
SET logic_based_fields = (SELECT """
                                 ,CASE 
                                       WHEN LOWER(event_name) LIKE '%video%' AND IFNULL(video_id_channel_id_sports_event_id,'') = '' THEN true 
                                    ELSE false END AS video_id_channel_id_sports_event_id_is_invalid_flag
                                 ,CASE 
                                       WHEN LOWER(event_name) LIKE '%video%' 
                                          AND ((IFNULL(navigation_section,'') ='' AND is_epg IS NOT NULL) 
                                             OR (is_epg IS NULL AND IFNULL(navigation_section,'') <>'') 
                                             OR (is_epg = TRUE AND IFNULL(epg_category,'') = '')) THEN true 
                                    ELSE false END AS client_path_sensitive_properties_is_invalid_flag
                                 ,CASE
                                    WHEN LOWER(event_name) = 'video content playing' 
                                       AND (video_heartbeat_value IS NULL OR video_heartbeat_value > 60 OR video_heartbeat_value <= 0) THEN TRUE
                                    ELSE FALSE END AS video_heartbeat_value_is_invalid_flag
                                 ,CASE WHEN LOWER(event_name) LIKE '%video%' THEN 1 ELSE 0 END AS video_event_flag
                           """);


-- Dynamic SQL to create temp table that will be use to insert into base table and invalid values table
EXECUTE IMMEDIATE format("""
   CREATE OR REPLACE TABLE `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_temp` 
   AS
      SELECT
         event_date
         ,anonymous_id
         ,sl.context_segment_source AS platform_name
         ,os_version
         ,event_id
         ,event_name
         ,event_type
         ,stream_type
         ,session_id
         ,stream_id
         ,ip
         ,navigation_section
         ,is_epg
         ,epg_category
         ,screen_id
         ,screen_title
         ,screen_type
         ,video_content_vertical
         ,video_genres_first
         ,video_id_channel_id_sports_event_id
         ,video_id
         ,channel_id
         ,sports_event_id
         ,video_is_kids
         ,video_player_mode
         ,video_title
         ,video_type
         ,video_heartbeat_value
         ,CASE WHEN event_name = 'Video Content Started' THEN true ELSE false END AS event_is_video_start_flag
         %s
         %s
      FROM (
      SELECT
            context_protocols_source_id,
            DATE(original_timestamp) AS event_date,
            id AS event_id,
            original_event_name AS event_name,
            original_event_type AS event_type,
            context_ip AS ip,
            anonymous_id,
            user_id,
            COALESCE(session_id,
            context_screen_properties_session_id) AS session_id,
            screen_id,
            screen_title,
            screen_type,
            stream_id,
            stream_type,
            video_id,
            video_type,
            video_title,
            video_genres_first,
            video_content_vertical,
            video_is_kids,
            video_player_mode,
            video_heartbeat_value,
            channel_id,
            sports_event_id,
            COALESCE(COALESCE(channel_id,video_id),sports_event_id) AS video_id_channel_id_sports_event_id,
            is_epg,
            epg_category_id,
            epg_category,
            navigation_section,
            context_os_version AS os_version,
            ROW_NUMBER() OVER (PARTITION BY id ORDER BY loaded_at DESC) AS __row_number
      FROM
         `st-vix-ott-dev.vix_collapsed_events_dev.master_event`
      WHERE
         DATE(_PARTITIONTIME) = %s 
         AND DATE(original_timestamp) = %s
         ) AS mev
      LEFT JOIN
      `st-vix-ott-dev.st_vix_ott_dev_us_data_master_dataset.st_vix_ott_dev_data_segment_lookup_table` sl 
      ON
      mev.context_protocols_source_id = sl.context_protocols_source_id
      WHERE mev.__row_number = 1
""",
logic_based_fields,
lookup_query,
current_event_date,
current_event_date
);

--Insert into the base and invalid values tables
IF (SELECT COUNT(*) FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_temp`) > 0 THEN

   --Delete current event date data to handle multiple runs in the same day
   EXECUTE IMMEDIATE format(""" 
               DELETE
               FROM
               `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_test`
               WHERE event_date = %s;
   """,
   current_event_date
   );

   EXECUTE IMMEDIATE format("""                
               DELETE
               FROM
               `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_invalid_values_test`
               WHERE event_date = %s;
   """,
   current_event_date
   );

   --Insert into base table
   INSERT INTO `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_test`
   (
      event_date,
      anonymous_id,
      platform_name,
      os_version,
      event_id,
      event_name,
      event_type,
      stream_type,
      session_id,
      stream_id,
      ip,
      navigation_section,
      is_epg,
      epg_category,
      screen_id,
      screen_title,
      screen_type,
      video_content_vertical,
      video_genres_first,
      video_id_channel_id_sports_event_id,
      video_id,
      channel_id,
      sports_event_id,
      video_is_kids,
      video_player_mode,
      video_type,
      anonymous_id_is_invalid_flag,
      client_path_sensitive_properties_is_invalid_flag,
      event_is_video_start_flag,
      ip_is_invalid_flag,
      screen_id_is_invalid_flag,
      screen_title_is_invalid_flag,
      screen_type_is_invalid_flag,
      session_id_is_invalid_flag,
      stream_id_is_invalid_flag,
      stream_type_is_invalid_flag,
      video_heartbeat_value,
      video_content_vertical_is_invalid_flag,
      video_genres_first_is_invalid_flag,
      video_heartbeat_value_is_invalid_flag,
      video_id_channel_id_sports_event_id_is_invalid_flag,
      video_is_kids_is_invalid_flag,
      video_player_mode_is_invalid_flag,
      video_type_is_invalid_flag,
      video_event_flag,
      created_datetime
   )
   SELECT 
      event_date,
      anonymous_id,
      platform_name,
      os_version,
      event_id,
      event_name,
      event_type,
      stream_type,
      session_id,
      stream_id,
      ip,
      navigation_section,
      is_epg,
      epg_category,
      screen_id,
      screen_title,
      screen_type,
      video_content_vertical,
      video_genres_first,
      video_id_channel_id_sports_event_id,
      video_id,
      channel_id,
      sports_event_id,
      video_is_kids,
      video_player_mode,
      video_type,
      anonymous_id_is_invalid_flag,
      client_path_sensitive_properties_is_invalid_flag,
      event_is_video_start_flag,
      ip_is_invalid_flag,
      screen_id_is_invalid_flag,
      screen_title_is_invalid_flag,
      screen_type_is_invalid_flag,
      session_id_is_invalid_flag,
      stream_id_is_invalid_flag,
      stream_type_is_invalid_flag,
      video_heartbeat_value,
      video_content_vertical_is_invalid_flag,
      video_genres_first_is_invalid_flag,
      video_heartbeat_value_is_invalid_flag,
      video_id_channel_id_sports_event_id_is_invalid_flag,
      video_is_kids_is_invalid_flag,
      video_player_mode_is_invalid_flag,
      video_type_is_invalid_flag,
      video_event_flag,
      CURRENT_DATETIME()
   FROM `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_temp`;

   --Insert into invalid value_values tables
   INSERT INTO `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_invalid_values_test`
      (
      event_date,
      anonymous_id,
      platform_name,
      os_version,
      event_name,
      event_type,
      event_invalid_values,
      created_datetime  
      )
      WITH cte_invalid
      AS
      (
      SELECT 
            event_date, 
            anonymous_id,       
            platform_name,
            os_version,     
            event_id,       
            event_name,     
            event_type,         
            CASE WHEN video_id_channel_id_sports_event_id_is_invalid_flag THEN video_id_channel_id_sports_event_id ELSE 'valid' END  AS video_id_channel_id_sports_event_id,
            CASE WHEN video_heartbeat_value_is_invalid_flag THEN cast(video_heartbeat_value as string) ELSE 'valid' END AS video_heartbeat_value,
            CASE WHEN ip_is_invalid_flag THEN ip ELSE 'valid' END  AS ip,
            CASE WHEN screen_id_is_invalid_flag THEN screen_id ELSE 'valid' END  AS screen_id,
            CASE WHEN screen_title_is_invalid_flag THEN screen_title ELSE 'valid' END  AS screen_title,
            CASE WHEN screen_type_is_invalid_flag THEN screen_type ELSE 'valid' END  AS screen_type,
            CASE WHEN session_id_is_invalid_flag THEN session_id ELSE 'valid' END  AS session_id,
            CASE WHEN stream_id_is_invalid_flag THEN stream_id ELSE 'valid' END  AS stream_id,
            CASE WHEN stream_type_is_invalid_flag THEN stream_type ELSE 'valid' END  AS stream_type,
            CASE WHEN video_content_vertical_is_invalid_flag THEN video_content_vertical ELSE 'valid' END  AS video_content_vertical,
            CASE WHEN video_genres_first_is_invalid_flag THEN video_genres_first ELSE 'valid' END  AS video_genres_first,
            CASE WHEN video_is_kids_is_invalid_flag THEN video_is_kids ELSE 'valid' END  AS video_is_kids,
            CASE WHEN video_player_mode_is_invalid_flag THEN video_player_mode ELSE 'valid' END  AS video_player_mode,
            CASE WHEN video_type_is_invalid_flag THEN video_type ELSE 'valid' END  AS video_type,
            CASE WHEN client_path_sensitive_properties_is_invalid_flag THEN navigation_section ELSE 'valid' END  AS navigation_section,
            CASE WHEN client_path_sensitive_properties_is_invalid_flag THEN CAST(is_epg AS STRING) ELSE 'valid' END  AS is_epg,
            CASE WHEN client_path_sensitive_properties_is_invalid_flag THEN epg_category ELSE 'valid' END  AS epg_category
      FROM
         `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_temp`
      WHERE 
            video_id_channel_id_sports_event_id_is_invalid_flag 
            OR client_path_sensitive_properties_is_invalid_flag 
            OR video_heartbeat_value_is_invalid_flag 
            OR anonymous_id_is_invalid_flag 
            OR ip_is_invalid_flag 
            OR screen_id_is_invalid_flag 
            OR screen_title_is_invalid_flag 
            OR screen_type_is_invalid_flag 
            OR session_id_is_invalid_flag 
            OR stream_id_is_invalid_flag 
            OR stream_type_is_invalid_flag 
            OR video_content_vertical_is_invalid_flag 
            OR video_genres_first_is_invalid_flag 
            OR video_is_kids_is_invalid_flag 
            OR video_player_mode_is_invalid_flag 
            OR video_type_is_invalid_flag

      ),
      cte_invalid_agg
      AS
      (SELECT 
            event_date, 
            anonymous_id,       
            platform_name,
            os_version,             
            event_name,     
            event_type,
            event_id,
         ARRAY_AGG(STRUCT(field_name,invalid_field_value)) AS invalid_field_value,
      FROM
      (
         SELECT
            *
         FROM 
         cte_invalid
      ) sl
      UNPIVOT INCLUDE NULLS
      (
         invalid_field_value FOR field_name IN (video_id_channel_id_sports_event_id,
                                                video_heartbeat_value,
                                                ip,
                                                screen_id,
                                                screen_title,
                                                screen_type,
                                                session_id,
                                                stream_id,
                                                stream_type,
                                                video_content_vertical,
                                                video_genres_first,
                                                video_is_kids,
                                                video_player_mode,
                                                video_type,
                                                navigation_section,
                                                is_epg,
                                                epg_category
                                                )
      
      )
      WHERE IFNULL(invalid_field_value,'') <> 'valid'
      GROUP BY       
            event_date, 
            anonymous_id,       
            platform_name,
            os_version,             
            event_name,     
            event_type,
            event_id
      )

      SELECT 
            event_date, 
            anonymous_id,       
            platform_name,
            os_version,             
            event_name,     
            event_type,
            ARRAY_AGG(STRUCT(event_id, invalid_field_value)) AS event_invalid_values,
            CURRENT_DATETIME()
      FROM cte_invalid_agg
      GROUP BY
            event_date, 
            anonymous_id,       
            platform_name,
            os_version,             
            event_name,     
            event_type;

   --Drop temp tables
   DROP TABLE `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_temp`;
   DROP TABLE `st-vix-ott-dev.st_vix_ott_dev_us_data_dq_quality_checks.st_vix_ott_dev_dq_monitoring_base_fields_temp`;
      
END IF;

I wrote a DAG and it got triggered and inserted today's values, but it has not dropped the temp tables as written at the end of the SQL file. Can someone check this, and also tell me how I can schedule it so that it gets triggered once a day at 3 AM PST? Here is my DAG code:

import datetime
import os
import logging
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from composer_plugins import get_query_content

# Environments variables
event_collection_project_id = os.environ["EVENT_COLLECTION_PROJECT_ID"]
sql_scripts_folder = os.environ["SQL_SCRIPTS_FOLDER"]
QA_CHECK_QUERY= "DMLs/data_qa_checks/DQ_check_base_table_new.sql"


yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1), datetime.datetime.min.time()
)

insert_data_from_sql_file = get_query_content(
    sql_scripts_folder,QA_CHECK_QUERY
)

logging.info(f"query: {insert_data_from_sql_file}")

default_dag_args = {
    # Setting start date as yesterday starts the DAG immediately when it is
    # detected in the Cloud Storage bucket.
    "start_date": yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    "email_on_failure": False,
    "email_on_retry": False,
    # If a task fails, retry it once after waiting at least what's specified in retry_delay
    "retries": 1,
    "retry_delay": datetime.timedelta(seconds=10),
    "project_id": event_collection_project_id,
}

with DAG(
        dag_id="data_qa_checks",
        schedule_interval=None,
        default_args=default_dag_args,
) as dag:

    # call the query that will insert the data from sql file and it will do the operations which are mentioned in the query
    DQ_dml = BigQueryInsertJobOperator(
        task_id="DQ_dml",
        job_id="{{ ts_nodash }}-DQ_dml",
        configuration={
            "query": {
                "query": insert_data_from_sql_file,
                "useLegacySql": "False",
            },
        },
        dag=dag,
    )      
Priya

3 Answers


If your Airflow setup is on an EC2/Kubernetes machine, then the source (SQL file) could be on a local path (if EC2) or an S3 path (if Kubernetes).

You can pass the variable (e.g. the SQL file path, or the date) at run time using params: https://stackoverflow.com/a/53686174/2986344
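For example, a minimal sketch of passing a date through params and referencing it from a templated query (the dag_id, table name, and param name below are placeholders, not from the question):

import datetime

from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

with DAG(
    dag_id="dq_check_params_example",
    start_date=datetime.datetime(2022, 6, 1),
    schedule_interval=None,
) as dag:
    run_dq_check = BigQueryInsertJobOperator(
        task_id="run_dq_check",
        # default value; at trigger time dag_run.conf can override it
        # (dag_run_conf_overrides_params is enabled by default)
        params={"event_date": "2022-01-01"},
        configuration={
            "query": {
                # `configuration` is a templated field, so {{ params.event_date }} is rendered
                "query": (
                    "SELECT COUNT(*) AS cnt FROM `my_project.my_dataset.my_table` "
                    "WHERE event_date = '{{ params.event_date }}'"
                ),
                "useLegacySql": False,
            }
        },
    )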

And for invoking BigQuery you can leverage a hook (with modification): https://registry.astronomer.io/dags/xcom-gcs-ds
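As a rough sketch of that hook approach (this assumes the apache-airflow-providers-google package; the connection id and file path are placeholders):

from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook


def run_sql_script():
    # read the SQL script from wherever it lives (local disk here; S3/GCS in other setups)
    with open("/path/to/DQ_check_base_table_new.sql") as f:
        sql = f.read()
    hook = BigQueryHook(gcp_conn_id="google_cloud_default", use_legacy_sql=False)
    # submit the whole script as one query job
    hook.insert_job(configuration={"query": {"query": sql, "useLegacySql": False}})


run_dq_script = PythonOperator(task_id="run_dq_script", python_callable=run_sql_script)

This task definition would live inside the same kind of with DAG(...) block as any other task.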

teedak8s
  • I am using GCP and Airflow. I don't need to install anything – Priya Jun 13 '22 at 03:52
  • No need to install anything other than a modification to the BigQuery hook (if required). The reference I shared, https://registry.astronomer.io/dags/xcom-gcs-ds (see the code section), would serve as a reference implementation only. – teedak8s Jun 13 '22 at 03:58
  • I have edited my question to include the SQL file and the DAG I wrote. Can you have a look and let me know whether I am making any mistake, and how to schedule the DAG? – Priya Jun 15 '22 at 03:44

You can use Airflow's Jinja templating system.

  1. Create a directory in your DAGs folder to store queries used by your DAGs, for example I created {AIRFLOW_HOME}/dags/templates

  2. Create a .sql file in this directory, for example dml_merge.sql.

  3. Paste the contents of your query into this file

 DECLARE current_event_date STRING DEFAULT CONCAT("'", CAST(DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY) AS STRING), "'");

How you handle the variable current_event_date depends on exactly how you're passing this variable into the DAG.

Without your full query it's not clear exactly what you need parameterised in the query, so for an example I will assume you want to pass current_event_date into your query as a variable from the DAG.

Are you triggering the dag with a conf and passing current_event_date into the DAG?

  1. Change the DEFAULT value of current_event_date in dml_merge.sql so it reads from the dag_run conf (using Airflow's templated syntax {{ }}):
 DECLARE current_event_date STRING DEFAULT "'{{ dag_run.conf.get('current_event_date') }}'";

When you trigger the DAG with a conf {"current_event_date": "2022-01-01"}, this value will be substituted into your query.

You can also use this to access Variables, XComs, etc. Any template variables listed in the Airflow templates reference can be used in your templated SQL file as long as you use the {{ }} syntax.

Do you need custom logic to work out current_event_date?

  1. Create a custom macro

Define a python function in your dag

def custom_macro():
    current_event_date = "some date" # function to get the `current_event_date`
    return current_event_date

Add this custom macro to the DAG by passing user_defined_macros

@dag(start_date=pendulum.today(tz="Europe/London"), user_defined_macros={"current_event_date": custom_macro})

Change the DEFAULT value of current_event_date in dml_merge.sql to call the macro, wrapped in {{ }} (as it is now a macro which you've defined). Because it's a macro, you need to call it (as it's a function) to distinguish it from a plain variable:

 DECLARE current_event_date STRING DEFAULT "'{{ current_event_date() }}'";

When you run your DAG, Airflow sees that current_event_date in your .sql file is a user-defined macro and will call the function custom_macro, which returns a value. You can use this if you need more complicated logic than just passing in the date with a conf or accessing it as a variable/XCom.
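Putting the macro pieces together, here is a rough, minimal sketch (the dag_id and the date logic inside custom_macro are stand-ins, not from the question):

import pendulum
from airflow.decorators import dag
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator


def custom_macro():
    # stand-in logic: default to yesterday's date as a string
    return pendulum.today(tz="UTC").subtract(days=1).to_date_string()


@dag(
    dag_id="dq_check_macro_example",
    start_date=pendulum.today(tz="Europe/London"),
    schedule_interval=None,
    user_defined_macros={"current_event_date": custom_macro},
)
def dq_check_macro_example():
    BigQueryInsertJobOperator(
        task_id="check_macro_value",
        configuration={
            "query": {
                # the macro call is rendered before the job is submitted
                "query": "SELECT '{{ current_event_date() }}' AS current_event_date",
                "useLegacySql": False,
            }
        },
    )


dq_check_macro_example()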

Macros are really powerful in Airflow. The Airflow tutorial page on Templating with Jinja is a good place to start if you want to learn more.

  1. Use BigQueryInsertJobOperator to run dml_merge.sql.

If you are using Cloud Composer in GCP then this operator is already available in the environment.

If your Airflow instance isn't on Cloud Composer, you'll need to install apache-airflow-providers-google into the environment. The documentation page for this provider is here.

Once you've done that you can import the operator:

from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

The documentation for this operator is here.

Finally you can create the task to run dml_merge.sql

run_dml_merge_query = BigQueryInsertJobOperator(
    task_id="run_dml_merge_query",
    configuration={
        "query": {
            "query": "{% include 'templates/dml_merge.sql' %}",
            "useLegacySql": False,
        }
    },
)

Again we are using the Jinja templating system with the {% include %} syntax; this will insert the contents of dml_merge.sql into the query dict for the configuration of the operator.

This pattern only works for operator arguments that are templated fields. You can see in the operator docs that the configuration argument is a templated field.
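One thing to watch (an assumption on my part about your layout, not something from the question): Jinja has to be able to find 'templates/dml_merge.sql'. By default Airflow searches relative to the folder containing the DAG file; you can make the search path explicit with template_searchpath, e.g.:

import pendulum
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator

with DAG(
    dag_id="dq_checks_include_example",
    start_date=pendulum.datetime(2022, 6, 1, tz="UTC"),
    schedule_interval=None,
    # directories Jinja searches when resolving {% include ... %} in templated fields
    template_searchpath=["/home/airflow/gcs/dags"],  # placeholder; use your DAGs folder
) as dag:
    run_dml_merge_query = BigQueryInsertJobOperator(
        task_id="run_dml_merge_query",
        configuration={
            "query": {
                "query": "{% include 'templates/dml_merge.sql' %}",
                "useLegacySql": False,
            }
        },
    )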

When you trigger the DAG and Airflow reaches this task:

  1. It will read dml_merge.sql
  2. Replace {% include 'templates/dml_merge.sql' %} in the "query" value with the contents of that file.
  3. Run any macros/make any substitutions for variables in the contents of this string.
  4. Run the query as a Job on BigQuery.

You will need to make sure that whatever service account is being used has the right permissions on BigQuery (roles/bigquery.jobUser); see the IAM page for BigQuery for details.

Daniel T
  • That's a great explanation. Can I do it without using Airflow's Jinja templating system? Can I simply create a DAG, place my SQL file under some folder, and then just pass the SQL file using some operator? How can I schedule it to run once a day at 3 AM PST? I need to pass the current_event_date variable in the Airflow config so that it processes only the last day's data. – Priya Jun 13 '22 at 03:51
  • You can use the `BigQueryInsertJobOperator` and pass the SQL file by using the templating system with `"query": "{% include 'templates/dml_merge.sql' %}",`. If you absolutely do not want to use the templating system, you could write a `PythonOperator` that reads the contents of the SQL file and pass that using an XCom (see the sketch after these comments). For scheduling, look at the `schedule_interval` in the [dag options](https://airflow.apache.org/docs/apache-airflow/stable/_api/airflow/models/dag/index.html#airflow.models.dag.DAG). – Daniel T Jun 13 '22 at 10:00
  • Yes, I want to use PythonOperator. It would be great and helpful if you can provide an example with the scheduling part, as you did in the first example. – Priya Jun 13 '22 at 14:46
  • I have tried and used this DAG. It's getting triggered, but I am not sure whether it's correct, because it inserted today's values into the table but did not drop the temp tables as written at the end of the SQL file. Would be great if you can check and let me know, and also advise on the scheduling part. The SQL file and DAG are too big for the comment character limit, so I have posted them in my question. – Priya Jun 15 '22 at 03:15
  • """ sql_scripts_folder = os.environ["SQL_SCRIPTS_FOLDER"] QA_CHECK_QUERY= "DMLs/data_qa_checks/DQ_check_base_table_new.sql" yesterday = datetime.datetime.combine( datetime.datetime.today() - datetime.timedelta(1), datetime.datetime.min.time() ) insert_data_from_sql_file = get_query_content( sql_scripts_folder,QA_CHECK_QUERY ) logging.info(f"query: {insert_data_from_sql_file}") – Priya Jun 15 '22 at 03:22
  • default_dag_args = { # Setting start date as yesterday starts the DAG immediately "start_date": yesterday, # To email on failure or retry set 'email' arg to your email "email_on_failure": False, "email_on_retry": False, # If a task fails, retry it once after waiting at least what's specified in retry_delay "retries": 1, "retry_delay": datetime.timedelta(seconds=10), "project_id": event_collection_project_id, } with DAG( dag_id="data_qa_checks", schedule_interval=None, default_args=default_dag_args, ) as dag: – Priya Jun 15 '22 at 03:32
  • # call the query that will insert the data from sql file and it will do the operations which are mentioned in the query DQ_dml = BigQueryInsertJobOperator( task_id="DQ_dml", job_id="{{ ts_nodash }}-DQ_dml", configuration={ "query": { "query": insert_data_from_sql_file, "useLegacySql": "False", }, }, dag=dag, ) – Priya Jun 15 '22 at 03:33
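For reference, a rough sketch of the PythonOperator + XCom pattern mentioned in the comments above (the file path and dag_id are placeholders; the value returned by the callable is pushed to XCom automatically and pulled into the templated configuration field):

import pendulum
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator


def read_sql_file():
    # whatever is returned here is pushed to XCom under the key "return_value"
    with open("/path/to/DMLs/data_qa_checks/DQ_check_base_table_new.sql") as f:
        return f.read()


with DAG(
    dag_id="dq_checks_xcom_example",
    start_date=pendulum.datetime(2022, 6, 1, tz="UTC"),
    schedule_interval=None,
) as dag:
    read_sql = PythonOperator(task_id="read_sql", python_callable=read_sql_file)

    run_sql = BigQueryInsertJobOperator(
        task_id="run_sql",
        configuration={
            "query": {
                # `configuration` is templated, so the XCom pull is rendered at run time
                "query": "{{ ti.xcom_pull(task_ids='read_sql') }}",
                "useLegacySql": False,
            }
        },
    )

    read_sql >> run_sql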

Airflow uses the UTC timezone, so I converted 3 AM PST to UTC, which is 11 AM UTC. I wrote a DAG and scheduled it at 11 AM UTC:

import datetime
import os
import logging
from airflow import DAG
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from composer_plugins import get_query_content


# Environments variables
event_collection_project_id = os.environ["EVENT_COLLECTION_PROJECT_ID"]
sql_scripts_folder = os.environ["SQL_SCRIPTS_FOLDER"]


QA_CHECK_QUERY= "DMLs/data_qa_checks/DQ_check_base_table_new_updated.sql"


yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1), datetime.datetime.min.time()
)

insert_data_from_sql_file = get_query_content(
    sql_scripts_folder,QA_CHECK_QUERY
)

logging.info(f"query: {insert_data_from_sql_file}")

default_dag_args = {
    # Setting start date as yesterday starts the DAG immediately when it is
    # detected in the Cloud Storage bucket.
    "start_date": yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    "email_on_failure": False,
    "email_on_retry": False,
    # If a task fails, retry it once after waiting at least what's specified in retry_delay
    "retries": 1,
    "retry_delay": datetime.timedelta(seconds=10),
    "project_id": event_collection_project_id,
}

with DAG(
        dag_id="data_qa_checks",
        schedule_interval="0 11 * * *",
        default_args=default_dag_args,
) as dag:

    # call the query that will insert the data from sql file and it will do the operations which are mentioned in the query
    DQ_dml = BigQueryInsertJobOperator(
        task_id="DQ_dml",
        job_id="{{ ts_nodash }}-DQ_dml",
        configuration={
            "query": {
                "query": insert_data_from_sql_file,
                "useLegacySql": "False",
            },
        },
        dag=dag,
    ) 
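One caveat, as an aside rather than part of the original answer: 11 AM UTC matches 3 AM Pacific only while PST (UTC-8) is in effect; during daylight saving time (PDT, UTC-7) the job runs at 4 AM local time. A minimal sketch, assuming Airflow 2's timezone support, of letting Airflow evaluate the cron expression in Pacific time instead:

import datetime

import pendulum
from airflow import DAG

local_tz = pendulum.timezone("America/Los_Angeles")

default_dag_args = {
    # an arbitrary past date; what matters is that it is timezone-aware
    "start_date": pendulum.datetime(2022, 6, 14, tz=local_tz),
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": datetime.timedelta(seconds=10),
}

with DAG(
    dag_id="data_qa_checks",
    # with a timezone-aware start_date, Airflow evaluates the cron expression in
    # that timezone, so this runs at 3 AM Pacific in both PST and PDT
    schedule_interval="0 3 * * *",
    default_args=default_dag_args,
    catchup=False,
) as dag:
    ...  # the BigQueryInsertJobOperator task from the DAG above goes here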
Priya