2
create table events(
    id char(36) PRIMARY KEY,
    game_id varchar(24) not null, 
    user_device_id char(36) not null, 
    event_name varchar(100) not null, 
    generated_at timestamp with time zone not null
);

SELECT 
    events.generated_at::DATE AS time_stamp, 
    COUNT(DISTINCT ( 
        CASE WHEN 
            events.event_name = 'new_user' THEN events.user_device_id 
        END
        )
    ) as new_users, 
    COUNT(DISTINCT (
        CASE WHEN 
            future_events.event_name <> 'new_user' THEN future_events.user_device_id 
        END
        )
    ) as returned_users, 
    COUNT(DISTINCT (
        CASE WHEN 
            future_events.event_name <> 'new_user' THEN future_events.user_device_id 
        END
    )) / COUNT(DISTINCT (
        CASE WHEN 
            events.event_name = 'new_user' THEN events.user_device_id 
        END
    ))::float as retention 
FROM events 
    LEFT JOIN events AS future_events ON 
        events.user_device_id = future_events.user_device_id AND 
        events.generated_at = future_events.generated_at - interval '1 day' AND 
        events.game_id = future_events.game_id
GROUP BY 
    time_stamp 
ORDER BY 
    time_stamp;

I am trying to get the Day N ('N' -> any number between 1 to 7) user retention via the above sql query. Due to the fact that I am a noob in HPE vertica, I am not being able to come up the optimum aggregate projection creating statement, Since projection significantly improves the performance of the query.

Antti29
  • 2,953
  • 12
  • 34
  • 36
Prata
  • 1,250
  • 2
  • 16
  • 31
  • add explain before the select, execute & and share the result – whoopdedoo Aug 14 '17 at 16:28
  • 1
    @Up_One why is that? – Prata Aug 15 '17 at 07:54
  • @Prataksha - the overall price payed is not replaced by the speed. Think about the ETL 2 Dashboards steps. You have to many restrictions when working with AGG Proj and the added complexity makes it not very usable – Up_One Aug 16 '17 at 00:32

1 Answers1

1

Aggregated projection won't help with a join query.

You can create a regular projection, segmented and sorted by the join columns, to achieve performance improvement:

CREATE PROJECTION events_p1 (
id,
game_id ENCODING RLE,
user_device_id ENCODING RLE,
event_name,
generated_at ENCODING RLE
) AS
SELECT id,
       game_id,
       user_device_id,
       event_name,
       generated_at
FROM events
ORDER BY generated_at,
         game_id, 
         user_device_id 
SEGMENTED BY hash(generated_at,game_id,user_device_id) ALL NODES KSAFE 1;
ya24
  • 490
  • 1
  • 4
  • 16