Hi and thanks for your time
- I'm trying to run a process that backfills data between the partition dates of '2021-06-08' and '2021-06-10'.
- Each partition requires 10 days of data
- I want to produce a row for every user every day in the partition, so long as they have a row within 10 days
- That means i want rows on the 8th, 9th and 10th for users uX and uZ
- I would like the final output to be:
partition_date | user | item_arry.daydate | item_array.item | item_array.score |
---|---|---|---|---|
8/6/21 | uX | 3/6/21 | iA | 0.2 |
6/6/21 | iB | 0.9 | ||
8/6/21 | uZ | 7/6/21 | iA | 0.9 |
9/6/21 | uX | 3/6/21 | iA | 0.2 |
6/6/21 | iB | 0.9 | ||
9/6/21 | uZ | 7/6/21 | iA | 0.9 |
10/6/21 | uX | 3/6/21 | iA | 0.2 |
6/6/21 | iB | 0.9 | ||
10/6/21 | uZ | 7/6/21 | iA | 0.9 |
10/6/21 | iB | 0.8 |
the main trouble i a having is at the final step where I do the second
array_agg
i would really like to be able to dedupe during this step
if you are able to think of a really nice way to do this, even a complete refactor it would be really appreciated
thanks in advance :)
CREATE TEMP FUNCTION get_missing_dates(daydate DATE, next_daydate DATE, max_days_forward INT64, to_date DATE) AS (
GENERATE_DATE_ARRAY(
daydate,
LEAST(
DATE_SUB(
IFNULL(
IF(DATE_DIFF(next_daydate, daydate, DAY) <= max_days_forward, next_daydate, NULL),
DATE_ADD(daydate, INTERVAL max_days_forward DAY)),
INTERVAL 1 DAY)
,to_date)
)
);
WITH
my_data AS (
--user x
SELECT DATE('2021-06-01') AS daydate, 'uX' AS user, 'iA' AS item, 0.1 AS score UNION ALL
SELECT DATE('2021-06-03') AS daydate, 'uX' AS user, 'iA' AS item, 0.2 AS score UNION ALL
SELECT DATE('2021-06-03') AS daydate, 'uX' AS user, 'iB' AS item, 0.3 AS score UNION ALL
SELECT DATE('2021-06-06') AS daydate, 'uX' AS user, 'iB' AS item, 0.9 AS score UNION ALL
-- user y
SELECT DATE('2021-06-01') AS daydate, 'uZ' AS user, 'iA' AS item, 0.4 AS score UNION ALL
SELECT DATE('2021-06-02') AS daydate, 'uZ' AS user, 'iA' AS item, 0.6 AS score UNION ALL
SELECT DATE('2021-06-07') AS daydate, 'uZ' AS user, 'iA' AS item, 0.9 AS score UNION ALL
SELECT DATE('2021-06-10') AS daydate, 'uZ' AS user, 'iB' AS item, 0.8 AS score
),
find_next_day_on AS (
SELECT
daydate,
user,
ARRAY_AGG(
STRUCT(
daydate,
item,
score)) AS item_array,
-- this produces a date indicating the next day the user was on
LEAD(daydate, 1) OVER (PARTITION BY user ORDER BY daydate ASC) AS next_daydate
FROM my_data
-- this is the partition dates i would like to run for, with partitions requiring 10 days of data
WHERE daydate BETWEEN DATE_SUB('2021-06-08', INTERVAL 10 DAY) AND '2021-06-10'
GROUP BY 1, 2
),
broadcast_missing_days AS (
SELECT
missing_days AS new_daydate,
user,
item_array
FROM find_next_day_on,
UNNEST(get_missing_dates(daydate,next_daydate,10,DATE('2021-06-10'))) AS missing_days
),
build_10day_array AS (
SELECT
new_daydate,
user,
-- ??? IS THERE A WAY TO DEDUPE THIS ARRAY IN THIS STEP SO I GET THE LATEST SCORE FOR EACH ITEM???
ARRAY_AGG(
STRUCT(
daydate,
item,
score
)) OVER (PARTITION BY user ORDER BY UNIX_DATE(new_daydate) RANGE BETWEEN 9 PRECEDING AND 0 PRECEDING)
FROM broadcast_missing_days,
UNNEST(item_array) AS ia
)
SELECT * FROM build_10day_array ORDER BY user, new_daydate