files/gitlab-cookbooks/monitoring/templates/postgres-queries.yaml.erb (373 lines of code) (raw):
pg_total_relation_size:
query: |
SELECT relnamespace::regnamespace as schemaname,
relname as relname,
pg_total_relation_size(oid) bytes
FROM pg_class
WHERE relkind = 'r';
metrics:
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- bytes:
usage: "GAUGE"
description: "total disk space usage for the specified table and associated indexes"
pg_blocked:
query: |
SELECT
count(blocked.transactionid) AS queries,
'__transaction__' AS table
FROM pg_catalog.pg_locks blocked
WHERE NOT blocked.granted AND locktype = 'transactionid'
GROUP BY locktype
UNION
SELECT
count(blocked.relation) AS queries,
blocked.relation::regclass::text AS table
FROM pg_catalog.pg_locks blocked
WHERE NOT blocked.granted AND locktype != 'transactionid'
GROUP BY relation
metrics:
- queries:
usage: "GAUGE"
description: "The current number of blocked queries"
- table:
usage: "LABEL"
description: "The table on which a query is blocked"
pg_oldest_blocked:
query: |
SELECT coalesce(extract('epoch' from max(clock_timestamp() - state_change)), 0) age_seconds
FROM pg_stat_activity
WHERE wait_event_type = 'Lock'
AND state='active'
metrics:
- age_seconds:
usage: "GAUGE"
description: "Largest number of seconds any transaction is currently waiting on a lock"
pg_slow:
query: |
SELECT COUNT(*) AS queries
FROM pg_stat_activity
WHERE state = 'active' AND (now() - query_start) > '1 seconds'::interval
metrics:
- queries:
usage: "GAUGE"
description: "Current number of slow queries"
pg_long_running_transactions:
query: |
SELECT COUNT(*) as transactions,
coalesce(MAX(EXTRACT(EPOCH FROM (clock_timestamp() - xact_start))), 0) AS age_in_seconds
FROM pg_stat_activity
WHERE state is distinct from 'idle' AND (now() - xact_start) > '1 minutes'::interval AND query not like 'autovacuum:%'
metrics:
- transactions:
usage: "GAUGE"
description: "Current number of long running transactions"
- age_in_seconds:
usage: "GAUGE"
description: "The current maximum transaction age in seconds"
pg_stuck_idle_in_transaction:
query: |
SELECT COUNT(*) AS queries
FROM pg_stat_activity
WHERE state = 'idle in transaction' AND (now() - query_start) > '10 minutes'::interval
metrics:
- queries:
usage: "GAUGE"
description: "Current number of queries that are stuck being idle in transactions"
# All xid and lsn metrics here are reported mod 2^52 to ensure they
# fit within a float for Prometheus :( Really annoying that counters
# aren't stored in a 64-bit integer. Note that for queries that report
# floats this only works because postgres_exporter does know to set
# extra_float_digits (which it sets to 2). So they don't print in
# exponential notation and precision is maintained up to 2^53-1.
pg_vacuum:
query: |
SELECT
COUNT(*) AS queries,
MAX(EXTRACT(EPOCH FROM (clock_timestamp() - query_start))) AS age_in_seconds
FROM pg_catalog.pg_stat_activity
WHERE state = 'active' AND trim(query) ~* '\AVACUUM (?!ANALYZE)'
metrics:
- queries:
usage: "GAUGE"
description: "The current number of VACUUM queries"
- age_in_seconds:
usage: "GAUGE"
description: "The current maximum VACUUM query age in seconds"
pg_vacuum_analyze:
query: |
SELECT
COUNT(*) AS queries,
MAX(EXTRACT(EPOCH FROM (clock_timestamp() - query_start))) AS age_in_seconds
FROM pg_catalog.pg_stat_activity
WHERE state = 'active' AND trim(query) ~* '\AVACUUM ANALYZE'
metrics:
- queries:
usage: "GAUGE"
description: "The current number of VACUUM ANALYZE queries"
- age_in_seconds:
usage: "GAUGE"
description: "The current maximum VACUUM ANALYZE query age in seconds"
pg_stuck_idle_in_transaction:
query: |
SELECT COUNT(*) AS queries
FROM pg_stat_activity
WHERE state = 'idle in transaction' AND (now() - query_start) > '10 minutes'::interval
metrics:
- queries:
usage: "GAUGE"
description: "Current number of queries that are stuck being idle in transactions"
pg_vacuum_queue:
#master: true
# Until postgres_exporter is upgraded. See
# https://gitlab.com/gitlab-org/omnibus-gitlab/issues/4887
query: |
with table_opts_vs_statistic as (
select
pg_class.oid,
pg_class.relname,
coalesce(nspname, 'public') as schemaname,
pg_class.relpages,
pg_class.reltuples,
case
when array_to_string(reloptions, '') like '%autovacuum_vacuum_threshold%' then regexp_replace(array_to_string(reloptions, ''), '.*autovacuum_vacuum_threshold=([0-9.]+).*', E'\\1')::int8
else current_setting('autovacuum_vacuum_threshold')::int8
end as autovacuum_vacuum_threshold,
case
when array_to_string(reloptions, '') like '%autovacuum_vacuum_scale_factor%' then regexp_replace(array_to_string(reloptions, ''), '.*autovacuum_vacuum_scale_factor=([0-9.]+).*', E'\\1')::numeric
else current_setting('autovacuum_vacuum_scale_factor')::numeric
end as autovacuum_vacuum_scale_factor,
case when array_to_string(reloptions, '') ~ 'autovacuum_enabled=(false|off)' then false else true end as autovacuum_enabled,
n_dead_tup,
last_autovacuum,
last_vacuum
from pg_class
join pg_namespace ns on relnamespace = ns.oid
join pg_stat_all_tables psat on psat.relid = pg_class.oid
where relkind in ('r','m')
), p as (
select pgspv.*,a.query,a.wait_event_type,a.wait_event,a.query_start
from pg_stat_progress_vacuum pgspv
left join pg_stat_activity a using (pid)
)
select
table_opts_vs_statistic.schemaname as schemaname,
table_opts_vs_statistic.relname as relname,
round((100 * table_opts_vs_statistic.n_dead_tup::numeric / nullif(table_opts_vs_statistic.reltuples, 0))::numeric, 2) as dead_tup_pct,
table_opts_vs_statistic.reltuples::numeric as reltuples,
table_opts_vs_statistic.n_dead_tup,
(relpages::bigint*8*1024) AS table_size_bytes,
'V. Threshold:' || table_opts_vs_statistic.autovacuum_vacuum_threshold
|| ', V. Scale Factor: ' || (table_opts_vs_statistic.autovacuum_vacuum_scale_factor)::numeric *100 ||' %'
|| case when not autovacuum_enabled then ', DISABLED' else ', enabled' end as "effective_settings",
case
when last_autovacuum > coalesce(last_vacuum, '0001-01-01') then last_autovacuum::timestamp(0)
when last_vacuum is not null then last_vacuum::timestamp(0)
else null
end as "last_vacuumed",
case
when last_autovacuum > coalesce(last_vacuum, '0001-01-01') then 'auto'
when last_vacuum is not null then 'manual'
else null
end as "type",
coalesce(p.phase, 'in queue') as status,
p.pid as pid,
coalesce (p.query,'')as action,
case when p.pid is null then null else coalesce(wait_event_type ||'.'|| wait_event, 'f') end as waiting,
round(100.0 * p.heap_blks_scanned / nullif(p.heap_blks_total, 0), 1) AS scanned_pct,
round(100.0 * p.heap_blks_vacuumed / nullif(p.heap_blks_total, 0), 1) AS vacuumed_pct,
extract ('epoch' from now()-query_start) elapsed_time
from
table_opts_vs_statistic
full outer join p on p.relid = table_opts_vs_statistic.oid and p.datname = current_database()
where
table_opts_vs_statistic.relpages >= 8
and autovacuum_vacuum_threshold + (autovacuum_vacuum_scale_factor::numeric * table_opts_vs_statistic.reltuples) < table_opts_vs_statistic.n_dead_tup
metrics:
- schemaname:
usage: "LABEL"
description: "Table Schema"
- relname:
usage: "LABEL"
description: "Table name"
- dead_tup_pct:
usage: "GAUGE"
description: "Estimated dead tuples percent"
- reltuples:
usage: "GAUGE"
description: "Number of tuples in table"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead tuples"
- table_size_bytes:
usage: "GAUGE"
description: "Estimated table size"
- effective_settings:
usage: "LABEL"
description: "Autovacuums settings"
- last_vacuumed:
usage: "GAUGE"
description: "Last time at which this table was vacuumed"
- type:
usage: "LABEL"
description: "Last vacuum type"
- status:
usage: "LABEL"
description: "Vacuum actual status"
- pid:
usage: "GAUGE"
description: "Vacuum process id"
- action:
usage: "LABEL"
description: "Type of vacuum executed"
- waiting:
usage: "LABEL"
description: "Vacuum queue status"
- scanned_pct:
usage: "GAUGE"
description: "Estimated rows scanned percent"
- vacuumed_pct:
usage: "GAUGE"
description: "Estimated vacuumed rows percent"
- elapsed_time:
usage: "GAUGE"
description: "Elapsed time vacuuming (in seconds)"
#
# This query extracts marginalia comments from pg_stat_activity and provides a sampled summary of the type of
# endpoints that are actively making calls
#
pg_stat_activity_marginalia_sampler:
query: >
SELECT
usename AS usename,
a.matches[1] AS application,
a.matches[2] AS endpoint,
a.matches[3] AS command,
a.wait_event AS wait_event,
a.state AS state,
a.wait_event_type AS wait_event_type,
COUNT(*) active_count,
MAX(coalesce(age_in_seconds, 0)) AS max_tx_age_in_seconds
FROM (
SELECT
usename,
regexp_matches(query, '^\s*(?:\/\*(?:application:(\w+),?)?(?:correlation_id:\w+,?)?(?:jid:\w+,?)?(?:endpoint_id:([\w/\-\.:\#\s]+),?)?.*?\*\/)?\s*(\w+)') AS matches,
state,
wait_event,
wait_event_type,
EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds
FROM
pg_stat_activity
) a
GROUP BY usename, application, endpoint, command, state, wait_event, wait_event_type
ORDER BY active_count DESC
metrics:
- usename:
usage: LABEL
description: The user running the command
- application:
usage: LABEL
description: Name of the application as presented in marginalia comment
- endpoint:
usage: LABEL
description: Name of the web endpoint or sidekiq job as presented in marginalia comment
- command:
usage: LABEL
description: The first word for the running command
- wait_event:
usage: LABEL
description: Wait event of the activity as presented by pg_stat_activity.wait_event
- state:
usage: LABEL
description: State of the activity as presented by pg_stat_activity.state
- wait_event_type:
usage: LABEL
description: Wait event type of the activity as presented by pg_stat_activity.wait_event_type
- active_count:
usage: GAUGE
description: Number of active queries at time of sample
- max_tx_age_in_seconds:
usage: GAUGE
description: Number of active queries at time of sample
# This records long running autovacuum processes. The reason we limit to
# long-running processes is to avoid cardinality problems in prometheus. Since
# we're only concerned about long running processes we limit the output only to
# those that take longer than 5 minutes.
pg_stat_activity_autovacuum:
query: >
SELECT
SPLIT_PART(query, '.', 2) AS relname,
EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds
FROM
pg_stat_activity
WHERE
query like 'autovacuum:%'
AND
EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) > 300
metrics:
- relname:
usage: LABEL
description: The table being vacuumed
- age_in_seconds:
usage: GAUGE
description: The age of the vacuum process in seconds
# Keep track of the number of the total number of autovacuum workers
# currently active
pg_stat_activity_autovacuum_active:
query: >
SELECT v.phase,
CASE
when a.query ~ '^autovacuum.*to prevent wraparound' then 'wraparound'
when a.query ~* '^vacuum' then 'user'
when a.pid is null then null
ELSE 'regular'
END as mode,
count(1) as workers_count
FROM pg_stat_progress_vacuum v
LEFT JOIN pg_stat_activity a using (pid)
GROUP BY 1,2
metrics:
- phase:
usage: LABEL
description: Vacuum phase
- mode:
usage: LABEL
description: Vacuum mode
- workers_count:
usage: GAUGE
description: The number of active autovacuum workers in this state
#
# This query extracts marginalia metadata from pg_stat_activity and provides a sampled summary about the long running transactions.
#
pg_long_running_transactions_marginalia:
query: >
SELECT
activity.matches[1] AS application,
activity.matches[2] AS endpoint,
MAX(age_in_seconds) AS max_age_in_seconds
FROM (
SELECT
regexp_matches(query, '^\s*(?:\/\*(?:application:(\w+),?)?(?:correlation_id:\w+,?)?(?:jid:\w+,?)?(?:endpoint_id:([\w/\-\.:\#\s]+),?)?.*?\*\/)?\s*(\w+)') AS matches,
EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds
FROM
pg_stat_activity
WHERE state <> 'idle'
AND (clock_timestamp() - xact_start) > '30 seconds'::interval
AND query NOT LIKE 'autovacuum:%'
) activity
GROUP BY application, endpoint
ORDER BY max_age_in_seconds DESC
metrics:
- application:
usage: LABEL
description: Name of the application as presented in marginalia comment
- endpoint:
usage: LABEL
description: Name of the web endpoint or sidekiq job as presented in marginalia comment
- max_age_in_seconds:
usage: GAUGE
description: The current maximum transaction age in seconds