I migrated a database from Postgresql 13.7 to Postgresql 14.3 recently, and I saw a x2 execution time in a specific case which I can't understand.
The issue has been seen first on Postgres on Linux, and I reproduced the issue using the following example on macOS with a clean Postgres installed through brew
.
Here is how to reproduce the setup:
-- DB Setup
CREATE DATABASE pg_regression_test;
-- Connect to the new DB
-- Table: units
CREATE TABLE units
(
id INTEGER PRIMARY KEY
);
-- Table: events
-- Value is a payload/metadata JSONB field where you can have
-- the id of a related unit.
-- If it's a unit, then you have a value = {'unit_id': 123}
CREATE TABLE events
(
action VARCHAR NOT NULL,
value JSONB,
timestamp TIMESTAMP NOT NULL
);
CREATE INDEX events_action_idx ON events (action);
CREATE INDEX events_unit_id_idx ON events ((value ->> 'unit_id')) WHERE (value ? 'unit_id');
-- Materialized view: daily_unit_events
-- Serves: how many similar actions have been made on this unit that day?
CREATE MATERIALIZED VIEW daily_unit_events AS
SELECT s.id AS unit_id,
s.action,
s.dt,
SUM(s.count) AS count
FROM (
WITH t AS (
SELECT events.value ->> 'unit_id' AS unit_id,
events.action,
DATE_TRUNC('day', events."timestamp")::DATE AS dt,
1 AS count
FROM events
WHERE events.value ? 'unit_id'
GROUP BY events.value ->> 'unit_id',
events.action,
(DATE_TRUNC('day', events."timestamp")::DATE)
)
SELECT units.id,
t.action,
t.dt,
SUM(t.count) AS count
FROM units
JOIN t ON units.id::TEXT = t.unit_id
GROUP BY units.id, t.action, t.dt
) s
GROUP BY s.id, s.action, s.dt;
CREATE INDEX daily_unit_events_unit_id_idx ON daily_unit_events (unit_id);
-- Unique index is mandatory for REFRESH MATERIALIZED VIEW CONCURRENTLY
CREATE UNIQUE INDEX daily_unit_events_unique_idx ON daily_unit_events (unit_id, action, dt);
-- Data generation
-- Insert 100k units
INSERT INTO units (id)
SELECT i
FROM GENERATE_SERIES(1, 100000) s(i);
-- Insert 1M events with a unit_id in the value. Around 20 kind of events. Est: 42s
INSERT INTO events (action, value, timestamp)
SELECT LEFT(MD5(((RANDOM() * 20)::INT)::TEXT), 12) AS action,
JSONB_BUILD_OBJECT('unit_id', (RANDOM() * 100000)::INT) AS value,
(NOW() - (INTERVAL '1 day' * (RANDOM() * 720)::INT))::TIMESTAMP AS timestamp
FROM GENERATE_SERIES(1, 1000000) s(i);
-- Insert 150k events without a unit_id in the value. Just to be on par with real data.
-- Around 20 kind of other events, distinct from the previous ones. Est: 3s
INSERT INTO events (action, value, timestamp)
SELECT LEFT(MD5(((RANDOM() * 20)::INT + 20)::TEXT), 12) AS action,
'{}'::JSONB AS value,
(NOW() - (INTERVAL '1 day' * (RANDOM() * 720)::INT))::TIMESTAMP AS timestamp
FROM GENERATE_SERIES(1, 150000) s(i);
The issue comes from the materialized view daily_unit_events
, where the refresh is two times as long on PG14 versus PG13:
REFRESH MATERIALIZED VIEW CONCURRENTLY daily_unit_events;
-- With test data, once first run is done. (first run is ~2x longer than follow ups)
-- PG 13.7:
-- - completed in 13 s 340 ms
-- - completed in 13 s 436 ms
-- - completed in 12 s 971 ms
-- - completed in 12 s 976 ms
-- PG 14.3:
-- - completed in 24 s 253 ms
-- - completed in 19 s 462 ms
-- - completed in 21 s 941 ms
-- - completed in 24 s 213 ms
If you take the query used for the materialized view, you can see the same behavior:
SELECT s.id AS unit_id,
s.action,
s.dt,
SUM(s.count) AS count
FROM (
WITH t AS (
SELECT events.value ->> 'unit_id' AS unit_id,
events.action,
DATE_TRUNC('day', events."timestamp")::DATE AS dt,
1 AS count
FROM events
WHERE events.value ? 'unit_id'
GROUP BY (events.value ->> 'unit_id'), events.action,
(DATE_TRUNC('day', events."timestamp")::DATE)
)
SELECT units.id,
t.action,
t.dt,
SUM(t.count) AS count
FROM units
JOIN t ON units.id::TEXT = t.unit_id
GROUP BY units.id, t.action, t.dt
) s
GROUP BY s.id, s.action, s.dt;
-- With test data (unit):
-- PG 13.7:
-- - completed in 3 s 355 ms (execution: 3 s 254 ms, fetching: 101 ms)
-- - completed in 3 s 272 ms (execution: 3 s 234 ms, fetching: 38 ms)
-- - completed in 3 s 331 ms (execution: 3 s 303 ms, fetching: 28 ms)
-- - completed in 3 s 332 ms (execution: 3 s 307 ms, fetching: 25 ms)
-- PG 14.3:
-- - completed in 11 s 436 ms (execution: 11 s 392 ms, fetching: 44 ms)
-- - completed in 14 s 520 ms (execution: 14 s 487 ms, fetching: 33 ms)
-- - completed in 12 s 830 ms (execution: 12 s 39 ms, fetching: 44 ms)
-- - completed in 11 s 345 ms (execution: 11 s 314 ms, fetching: 31 ms)
EXPLAIN (ANALYZE, BUFFERS)
the previous query outputs:
- PG13:
GroupAggregate (cost=5274537.33..13620413.68 rows=5750000 width=53) (actual time=3215.938..4877.993 rows=999667 loops=1)
Group Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=2985402, temp read=4653 written=4678
-> Finalize GroupAggregate (cost=5274537.33..12398538.68 rows=57500000 width=29) (actual time=3215.929..4213.314 rows=999667 loops=1)
Group Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=2985402, temp read=4653 written=4678
-> Gather Merge (cost=5274537.33..11344372.02 rows=47916666 width=29) (actual time=3215.921..3746.462 rows=999667 loops=1)
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=2985402, temp read=4653 written=4678
-> Partial GroupAggregate (cost=5273537.31..5812599.80 rows=23958333 width=29) (actual time=3111.237..3363.517 rows=333222 loops=3)
Group Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=2985402, temp read=4653 written=4678
-> Sort (cost=5273537.31..5333433.14 rows=23958333 width=25) (actual time=3111.218..3177.549 rows=333222 loops=3)
Sort Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Sort Method: external merge Disk: 15016kB
Buffers: shared hit=2985402, temp read=4653 written=4678
Worker 0: Sort Method: external merge Disk: 10256kB
Worker 1: Sort Method: external merge Disk: 11952kB
-> Merge Join (cost=5220.27..617292.54 rows=23958333 width=25) (actual time=90.272..2938.759 rows=333222 loops=3)
Merge Cond: (((units.id)::text) = ((events.value ->> 'unit_id'::text)))
Buffers: shared hit=2985364
-> Sort (cost=5218.20..5322.36 rows=41667 width=4) (actual time=58.281..63.245 rows=33333 loops=3)
Sort Key: ((units.id)::text)
Sort Method: quicksort Memory: 3257kB
Buffers: shared hit=278
Worker 0: Sort Method: quicksort Memory: 2055kB
Worker 1: Sort Method: quicksort Memory: 2278kB
-> Parallel Index Only Scan using units_pkey on units (cost=0.29..2020.96 rows=41667 width=4) (actual time=0.068..26.356 rows=33333 loops=3)
Heap Fetches: 0
Buffers: shared hit=278
-> Materialize (cost=2.07..132774.67 rows=115000 width=53) (actual time=0.466..2643.878 rows=993712 loops=3)
Buffers: shared hit=2985086
-> Group (cost=2.07..131337.17 rows=115000 width=53) (actual time=0.450..2228.330 rows=993712 loops=3)
Group Key: ((events.value ->> 'unit_id'::text)), events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=2985086
-> Incremental Sort (cost=2.07..122951.83 rows=1003045 width=49) (actual time=0.447..1899.806 rows=994041 loops=3)
Sort Key: ((events.value ->> 'unit_id'::text)), events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Presorted Key: ((events.value ->> 'unit_id'::text))
Full-sort Groups: 27025 Sort Method: quicksort Average Memory: 27kB Peak Memory: 27kB
Buffers: shared hit=2985086
Worker 0: Full-sort Groups: 26566 Sort Method: quicksort Average Memory: 27kB Peak Memory: 27kB
Worker 1: Full-sort Groups: 27000 Sort Method: quicksort Average Memory: 27kB Peak Memory: 27kB
-> Index Scan using events_unit_id_idx on events (cost=0.42..73820.46 rows=1003045 width=49) (actual time=0.101..1184.753 rows=994056 loops=3)
Buffers: shared hit=2985028
Planning Time: 0.435 ms
Execution Time: 4974.368 ms
- PG14
GroupAggregate (cost=5272591.23..13616435.63 rows=5748600 width=53) (actual time=9223.276..11004.233 rows=999671 loops=1)
Group Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=3002115, temp read=4653 written=4678
-> Finalize GroupAggregate (cost=5272591.23..12394858.13 rows=57486000 width=29) (actual time=9223.269..10326.712 rows=999671 loops=1)
Group Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=3002115, temp read=4653 written=4678
-> Gather Merge (cost=5272591.23..11340948.13 rows=47905000 width=29) (actual time=9223.261..9827.932 rows=999671 loops=1)
Workers Planned: 2
Workers Launched: 2
Buffers: shared hit=3002115, temp read=4653 written=4678
-> Partial GroupAggregate (cost=5271591.21..5810522.46 rows=23952500 width=29) (actual time=8045.307..8301.748 rows=333224 loops=3)
Group Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=3002115, temp read=4653 written=4678
-> Sort (cost=5271591.21..5331472.46 rows=23952500 width=25) (actual time=8045.284..8115.779 rows=333224 loops=3)
Sort Key: units.id, events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Sort Method: external merge Disk: 17824kB
Buffers: shared hit=3002115, temp read=4653 written=4678
Worker 0: Sort Method: external merge Disk: 10640kB
Worker 1: Sort Method: external merge Disk: 8760kB
-> Merge Join (cost=5220.24..616523.46 rows=23952500 width=25) (actual time=739.098..7296.621 rows=333224 loops=3)
Merge Cond: (((units.id)::text) = ((events.value ->> 'unit_id'::text)))
Buffers: shared hit=3002075
-> Sort (cost=5218.20..5322.36 rows=41667 width=4) (actual time=571.336..577.142 rows=33333 loops=3)
Sort Key: ((units.id)::text)
Sort Method: quicksort Memory: 3613kB
Buffers: shared hit=278
Worker 0: Sort Method: quicksort Memory: 2107kB
Worker 1: Sort Method: quicksort Memory: 1870kB
-> Parallel Index Only Scan using units_pkey on units (cost=0.29..2020.96 rows=41667 width=4) (actual time=0.063..15.491 rows=33333 loops=3)
Heap Fetches: 0
Buffers: shared hit=278
-> Materialize (cost=2.05..132122.18 rows=114972 width=53) (actual time=0.434..6303.380 rows=999327 loops=3)
Buffers: shared hit=3001797
-> Group (cost=2.05..130685.03 rows=114972 width=53) (actual time=0.425..5882.899 rows=999327 loops=3)
Group Key: ((events.value ->> 'unit_id'::text)), events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Buffers: shared hit=3001797
-> Incremental Sort (cost=2.05..122329.63 rows=999082 width=49) (actual time=0.423..5532.640 rows=999650 loops=3)
Sort Key: ((events.value ->> 'unit_id'::text)), events.action, ((date_trunc('day'::text, events.""timestamp""))::date)
Presorted Key: ((events.value ->> 'unit_id'::text))
Full-sort Groups: 27020 Sort Method: quicksort Average Memory: 28kB Peak Memory: 28kB
Buffers: shared hit=3001797
Worker 0: Full-sort Groups: 26999 Sort Method: quicksort Average Memory: 28kB Peak Memory: 28kB
Worker 1: Full-sort Groups: 27023 Sort Method: quicksort Average Memory: 28kB Peak Memory: 28kB
-> Index Scan using events_unit_id_idx on events (cost=0.42..73541.97 rows=999082 width=49) (actual time=0.049..1407.220 rows=999666 loops=3)
Buffers: shared hit=3001739
Planning Time: 0.269 ms
Execution Time: 11103.528 ms
Both query plans are identical, estimated costs and buffers looks the same, but at the end the execution time is more than two times higher.
On macOS, output of SELECT * FROM pg_config;
is almost identical, they are the defaults when installing from brew
. The only difference is in the CFLAGS
entry, where PG14 has been built with an additional -Werror=unguarded-availability-new
(I stripped down the *DIR
for conciseness)
PGXS,/usr/local/lib/postgresql/pgxs/src/makefiles/pgxs.mk
CONFIGURE, '--disable-debug' '--prefix=/usr/local/Cellar/postgresql/14.3' '--datadir=/usr/local/share/postgresql' '--libdir=/usr/local/lib' '--includedir=/usr/local/include' '--sysconfdir=/usr/local/etc' '--docdir=/usr/local/Cellar/postgresql/14.3/share/doc/postgresql' '--enable-thread-safety' '--with-gssapi' '--with-icu' '--with-ldap' '--with-libxml' '--with-libxslt' '--with-openssl' '--with-pam' '--with-perl' '--with-uuid=e2fs' '--with-bonjour' '--with-tcl' 'PG_SYSROOT=/Library/Developer/CommandLineTools/SDKs/MacOSX11.sdk' 'CC=clang' 'LDFLAGS=-L/usr/local/opt/openssl@1.1/lib -L/usr/local/opt/readline/lib' 'CPPFLAGS=-I/usr/local/opt/openssl@1.1/include -I/usr/local/opt/readline/include' 'CXX=clang++' 'PKG_CONFIG_PATH=/usr/local/opt/icu4c/lib/pkgconfig:/usr/local/opt/openssl@1.1/lib/pkgconfig:/usr/local/opt/krb5/lib/pkgconfig:/usr/local/opt/readline/lib/pkgconfig' 'PKG_CONFIG_LIBDIR=/usr/lib/pkgconfig:/usr/local/Homebrew/Library/Homebrew/os/mac/pkgconfig/11'
CC,clang
CPPFLAGS,-I/usr/local/Cellar/icu4c/70.1/include -isysroot /Library/Developer/CommandLineTools/SDKs/MacOSX11.sdk -I/usr/local/opt/openssl@1.1/include -I/usr/local/opt/readline/include
CFLAGS,-Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Werror=vla -Werror=unguarded-availability-new -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -Wno-unused-command-line-argument -Wno-compound-token-split-by-macro -O2
CFLAGS_SL,""
LDFLAGS,"-isysroot /Library/Developer/CommandLineTools/SDKs/MacOSX11.sdk -L/usr/local/opt/openssl@1.1/lib -L/usr/local/opt/readline/lib -Wl,-dead_strip_dylibs"
LDFLAGS_EX,""
LDFLAGS_SL,""
LIBS,-lpgcommon -lpgport -lxslt -lxml2 -lpam -lssl -lcrypto -lgssapi_krb5 -lz -lreadline -lm
VERSION,PostgreSQL 14.3
On the pg_settings
side, it's also almost identical:
-- Common config keys:
server_version,14.3,
server_version_num,140003,
lc_collate,en_US.UTF-8,
lc_ctype,en_US.UTF-8,
lc_messages,en_US.UTF-8,
lc_monetary,en_US.UTF-8,
lc_numeric,en_US.UTF-8,
lc_time,en_US.UTF-8,
server_version,13.7,
server_version_num,130007,
lc_collate,C,
lc_ctype,C,
lc_messages,C,
lc_monetary,C,
lc_numeric,C,
lc_time,C,
-- Only present in one
server_version,13.7,
operator_precedence_warning,off,
vacuum_cleanup_index_scale_factor,0.1,
server_version,14.3,
client_connection_check_interval,0,ms
compute_query_id,auto,
debug_discard_caches,0,
default_toast_compression,pglz,
enable_async_append,on,
enable_memoize,on,
huge_page_size,0,kB
idle_session_timeout,0,ms
in_hot_standby,off,
log_recovery_conflict_waits,off,
min_dynamic_shared_memory,0,MB
recovery_init_sync_method,fsync,
remove_temp_files_after_crash,on,
ssl_crl_dir,"",
update_process_title,on,
vacuum_failsafe_age,1600000000,
vacuum_multixact_failsafe_age,1600000000,
What could be the cause of this performance regression? Thank you!