0

This is following on from this question which gets me almost there, but I'd like to only return the rows that have the fullest amount of data.

Extending the solution example from the question linked above:

#standardSQL
WITH `project.dataset.your_table` AS (
  SELECT 1 id, 'first_data' col_1, NULL col_2,  '4/22' updated UNION ALL
  SELECT 1,     NULL,             'old',        '4/23'         UNION ALL
  SELECT 1,     NULL,             'correct',    '4/24'         UNION ALL
  SELECT 1,    'next_data',       NULL,         '4/25'         UNION ALL
  SELECT 1,     NULL,             NULL,         '4/26'  
  SELECT 2,     NULL,             'old',        '4/23'         UNION ALL
  SELECT 2,     'this_one',        NULL,        NULL,          UNION ALL            
)
SELECT id, 
  IFNULL(col_1, FIRST_VALUE(col_1 IGNORE NULLS) OVER(win)) col_1, 
  IFNULL(col_2, FIRST_VALUE(col_2 IGNORE NULLS) OVER(win)) col_2, 
  updated
FROM `project.dataset.your_table`
WINDOW win AS (PARTITION BY id ORDER BY updated DESC 
               ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
ORDER BY id, updated

I would like the result to look like:

id  col_1       col_2   updated     
1   next_data   correct 4/26 
2   this_one    old     4/23

I tried this by using ROW_NUMBER() and it works, but it was suggested it is not very memory efficient:

SELECT * EXCEPT(n) FROM (
    WITH `project.dataset.your_table` AS (
      SELECT 1 id, 'first_data' col_1, NULL col_2,  '4/22' updated UNION ALL
      SELECT 1,     NULL,             'old',        '4/23'         UNION ALL
      SELECT 1,     NULL,             'correct',    '4/24'         UNION ALL
      SELECT 1,    'next_data',       NULL,         '4/25'         UNION ALL
      SELECT 1,     NULL,             NULL,         '4/26'         UNION ALL
      SELECT 2,     NULL,             'old',        '4/23'         UNION ALL
      SELECT 2,     'this_one',        NULL,        NULL                     
    )
    SELECT id, 
      IFNULL(col_1, FIRST_VALUE(col_1 IGNORE NULLS) OVER(win)) col_1, 
      IFNULL(col_2, FIRST_VALUE(col_2 IGNORE NULLS) OVER(win)) col_2, 
      updated,
      ROW_NUMBER() OVER (PARTITION BY id) as n
    FROM `project.dataset.your_table`
    WINDOW win AS (PARTITION BY id ORDER BY updated DESC 
                   ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
    ORDER BY id, updated)
 WHERE n = 1

which gives:

id  col_1       col_2   updated     
1   next_data   correct 4/26 
2   this_one    old     4/23
MarkeD
  • 2,500
  • 2
  • 21
  • 35

1 Answers1

4
#standardSQL
WITH `project.dataset.your_table` AS (
  SELECT 1 id, 'first_data' col_1, NULL col_2,  '4/22' updated UNION ALL
  SELECT 1,     NULL,             'old',        '4/23'         UNION ALL
  SELECT 1,     NULL,             'correct',    '4/24'         UNION ALL
  SELECT 1,    'next_data',       NULL,         '4/25'         UNION ALL
  SELECT 1,     NULL,             NULL,         '4/26'         UNION ALL
  SELECT 2,     NULL,             'old',        '4/23'         UNION ALL
  SELECT 2,     'this_one',        NULL,        NULL                 
)
SELECT id, 
  ARRAY_AGG(col_1 IGNORE NULLS ORDER BY updated DESC LIMIT 1)[SAFE_OFFSET(0)] col_1,
  ARRAY_AGG(col_2 IGNORE NULLS ORDER BY updated DESC LIMIT 1)[SAFE_OFFSET(0)] col_2,
  MAX(updated) updated
FROM `project.dataset.your_table`
GROUP BY id   

with result

Row id  col_1       col_2       updated  
1   1   next_data   correct     4/26     
2   2   this_one    old         4/23     
Mikhail Berlyant
  • 165,386
  • 8
  • 154
  • 230
  • 1
    Neat thanks, I'll look at this tomorrow with the real example! – MarkeD Nov 27 '18 at 22:15
  • Great thanks - my data actually didn't have the updated column so I just took `ARRAY_AGG(col1 IGNORE NULLS LIMIT 1)[SAFE_OFFSET(0)] col1,` etc but this answer will be useful for other cases as well, thanks again. – MarkeD Nov 28 '18 at 10:30