0

Hi and thanks for your time

  • I'm trying to run a process that backfills data between the partition dates of '2021-06-08' and '2021-06-10'.
  • Each partition requires 10 days of data
  • I want to produce a row for every user every day in the partition, so long as they have a row within 10 days
  • That means i want rows on the 8th, 9th and 10th for users uX and uZ
  • I would like the final output to be:
partition_date user item_arry.daydate item_array.item item_array.score
8/6/21 uX 3/6/21 iA 0.2
6/6/21 iB 0.9
8/6/21 uZ 7/6/21 iA 0.9
9/6/21 uX 3/6/21 iA 0.2
6/6/21 iB 0.9
9/6/21 uZ 7/6/21 iA 0.9
10/6/21 uX 3/6/21 iA 0.2
6/6/21 iB 0.9
10/6/21 uZ 7/6/21 iA 0.9
10/6/21 iB 0.8
  • the main trouble i a having is at the final step where I do the second array_agg

  • i would really like to be able to dedupe during this step

  • if you are able to think of a really nice way to do this, even a complete refactor it would be really appreciated

  • thanks in advance :)

CREATE TEMP FUNCTION get_missing_dates(daydate DATE, next_daydate DATE, max_days_forward INT64, to_date DATE) AS (
GENERATE_DATE_ARRAY(
  daydate,
  LEAST(
    DATE_SUB(
      IFNULL(
        IF(DATE_DIFF(next_daydate, daydate, DAY) <= max_days_forward, next_daydate, NULL),
        DATE_ADD(daydate, INTERVAL max_days_forward DAY)),
      INTERVAL 1 DAY)
    ,to_date)
  )
);


WITH 
my_data AS (
--user x
SELECT DATE('2021-06-01') AS daydate, 'uX' AS user, 'iA' AS item, 0.1 AS score UNION ALL
SELECT DATE('2021-06-03') AS daydate, 'uX' AS user, 'iA' AS item, 0.2 AS score UNION ALL
SELECT DATE('2021-06-03') AS daydate, 'uX' AS user, 'iB' AS item, 0.3 AS score UNION ALL
SELECT DATE('2021-06-06') AS daydate, 'uX' AS user, 'iB' AS item, 0.9 AS score UNION ALL
-- user y
SELECT DATE('2021-06-01') AS daydate, 'uZ' AS user, 'iA' AS item, 0.4 AS score UNION ALL
SELECT DATE('2021-06-02') AS daydate, 'uZ' AS user, 'iA' AS item, 0.6 AS score UNION ALL
SELECT DATE('2021-06-07') AS daydate, 'uZ' AS user, 'iA' AS item, 0.9 AS score UNION ALL
SELECT DATE('2021-06-10') AS daydate, 'uZ' AS user, 'iB' AS item, 0.8 AS score
),

find_next_day_on AS (
SELECT 
  daydate,
  user,
  ARRAY_AGG(
    STRUCT(
      daydate,
      item,
      score)) AS item_array,
  -- this produces a date indicating the next day the user was on
  LEAD(daydate, 1) OVER (PARTITION BY user ORDER BY daydate ASC) AS next_daydate
FROM my_data
-- this is the partition dates i would like to run for, with partitions requiring 10 days of data
WHERE daydate BETWEEN DATE_SUB('2021-06-08', INTERVAL 10 DAY) AND '2021-06-10'
GROUP BY 1, 2
),

broadcast_missing_days AS (
SELECT
  missing_days AS new_daydate,
  user,
  item_array
FROM find_next_day_on, 
  UNNEST(get_missing_dates(daydate,next_daydate,10,DATE('2021-06-10'))) AS missing_days
),
build_10day_array AS (
SELECT
  new_daydate,
  user,
  -- ??? IS THERE A WAY TO DEDUPE THIS ARRAY IN THIS STEP SO I GET THE LATEST SCORE FOR EACH ITEM???
  ARRAY_AGG(
    STRUCT(
    daydate,
    item,
    score
    )) OVER (PARTITION BY user ORDER BY UNIX_DATE(new_daydate) RANGE BETWEEN 9 PRECEDING AND 0 PRECEDING)
FROM broadcast_missing_days,
  UNNEST(item_array) AS ia
)
SELECT * FROM build_10day_array ORDER BY user, new_daydate

0 Answers0