All Redshift monitoring queries in one place.
NOTE: All queries are taken from the Redshift documentation provided by AWS. Here is the link for all the system tables:
And the Redshift Utility information:
-- Reason why an S3 COPY failed (replace <queryID> with the query ID of the failed COPY)
select d.query, substring(d.filename,14,20) as filename,
d.line_number as line,
substring(d.value,1,16) as value,
substring(le.err_reason,1,48) as err_reason
from stl_loaderror_detail d, stl_load_errors le
where d.query = le.query
and d.query = <queryID>;
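If the failing COPY's query ID is not known yet, it can be looked up first. A minimal sketch against stl_load_errors (column choice and ordering are just one reasonable option):
-- List the most recent load errors to find the <queryID> used above.
select query, starttime, trim(filename) as filename, err_code, trim(err_reason) as err_reason
from stl_load_errors
order by starttime desc
limit 10;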
-- COPY performance information from the past 7 days
select q.starttime, s.query, substring(q.querytxt,1,120) as querytxt,
s.n_files, size_mb, s.time_seconds,
s.size_mb/decode(s.time_seconds,0,1,s.time_seconds) as mb_per_s
from (select query, count(*) as n_files,
sum(transfer_size/(1024*1024)) as size_MB, (max(end_Time) -
min(start_Time))/(1000000) as time_seconds , max(end_time) as end_time
from stl_s3client where http_method = 'GET' and query > 0
and transfer_time > 0 group by query ) as s
LEFT JOIN stl_Query as q on q.query = s.query
where s.end_Time >= dateadd(day, -7, current_Date)
order by s.time_Seconds desc, size_mb desc, s.end_time desc
limit 50;
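For per-file detail on one of the slow loads above, stl_load_commits can be queried directly. A hedged sketch (the query ID below is a hypothetical placeholder):
-- Per-file breakdown of a single COPY; substitute a query ID from the listing above.
select query, trim(filename) as filename, lines_scanned, curtime, status
from stl_load_commits
where query = 123456
order by curtime;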
--sessions with currently running queries
SELECT s.process AS pid
,date_Trunc ('second',s.starttime) AS S_START
,datediff(minutes,s.starttime,getdate ()) AS conn_mins
,trim(s.user_name) AS USER
,trim(s.db_name) AS DB
,date_trunc ('second',i.starttime) AS Q_START
,i.query
,trim(i.query) AS sql
FROM stv_sessions s
LEFT JOIN stv_recents i
ON s.process = i.pid
AND i.status = 'Running'
WHERE s.user_name <> 'rdsdb'
ORDER BY 1;
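If one of these sessions needs to be stopped, the pid from the result can be passed to pg_terminate_backend (requires superuser privileges or ownership of the session; the pid below is a hypothetical placeholder):
-- Terminate a runaway session by its pid.
select pg_terminate_backend(12345);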
-- Currently running and queued queries, per WLM queue:
select trim(u.usename) as user, s.pid, q.xid, q.query, q.service_class as "q", q.slot_count as slt,
       date_trunc('second', q.wlm_start_time) as start,
       decode(trim(q.state), 'Running','Run', 'QueuedWaiting','Queue', 'Returning','Return', trim(q.state)) as state,
       q.queue_time/1000000 as q_sec, q.exec_time/1000000 as exe_sec, m.cpu_time/1000000 as cpu_sec,
       m.blocks_read as read_mb, decode(m.blocks_to_disk, -1, null, m.blocks_to_disk) as spill_mb,
       m2.rows as ret_rows, m3.rows as NL_rows,
       substring(replace(nvl(qrytext_cur.text, trim(translate(s.text, chr(10)||chr(13)||chr(9), ''))), '\\n', ' '), 1, 90) as sql,
       trim(decode(event&1,1,'SK ','') || decode(event&2,2,'Del ','') || decode(event&4,4,'NL ','') ||
            decode(event&8,8,'Dist ','') || decode(event&16,16,'Bcast ','') || decode(event&32,32,'Stats ','')) as Alert
from stv_wlm_query_state q
left outer join stl_querytext s on (s.query=q.query and sequence = 0)
left outer join stv_query_metrics m on ( q.query = m.query and m.segment=-1 and m.step=-1 )
left outer join stv_query_metrics m2 on ( q.query = m2.query and m2.step_type = 38 )
left outer join ( select query, sum(rows) as rows from stv_query_metrics m3 where step_type = 15 group by 1) as m3 on ( q.query = m3.query )
left outer join pg_user u on ( s.userid = u.usesysid )
LEFT OUTER JOIN (SELECT ut.xid,'CURSOR ' || TRIM( substring ( TEXT from strpos(upper(TEXT),'SELECT') )) as TEXT
FROM stl_utilitytext ut
WHERE sequence = 0
AND upper(TEXT) like 'DECLARE%'
GROUP BY text, ut.xid) qrytext_cur ON (q.xid = qrytext_cur.xid)
left outer join ( select query,sum(decode(trim(split_part(event,':',1)),'Very selective query filter',1,'Scanned a large number of deleted rows',2,'Nested Loop Join in the query plan',4,'Distributed a large number of rows across the network',8,'Broadcasted a large number of rows across the network',16,'Missing query planner statistics',32,0)) as event from STL_ALERT_EVENT_LOG
where event_time >= dateadd(hour, -8, current_Date) group by query ) as alrt on alrt.query = q.query
order by q.service_class,q.exec_time desc, q.wlm_start_time;
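To translate the service_class numbers ("q") above into queue definitions, the WLM configuration can be read from stv_wlm_service_class_config. A hedged sketch assuming the documented column names:
-- Map service classes to WLM queue names, slot counts, and memory.
select service_class, trim(name) as queue_name, num_query_tasks as slots, query_working_mem as mem_mb
from stv_wlm_service_class_config
where service_class > 4
order by service_class;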
/**********************************************************************************************
Purpose: Return alerts from the past 7 days
Columns:
schema: Name of the schema of the table, when applicable
table: Name of the table where the alert happened, when applicable
minutes: Number of minutes spent doing the action. Not available for all alerts
rows: Number of rows in the scan/dist/bcast
event: What the alerted event is
solution: Proposed solution to avoid the alert (and performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count: Number of occurrences of the alert
query_text: Reconstructed SQL text of the query
Notes:
History:
2015-02-09 ericfe created
2015-04-17 ericfe Added detail information on distributions and broadcasts. Added rows column
2017-08-11 andrewgross Added schema and query text information.
**********************************************************************************************/
SELECT t.schema AS SCHEMA,
trim(s.perm_table_name) AS TABLE,
(sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), CASE WHEN coalesce(b.endtime,d.endtime,s.endtime) > coalesce(b.starttime,d.starttime,s.starttime) THEN coalesce(b.endtime,d.endtime,s.endtime) ELSE coalesce(b.starttime,d.starttime,s.starttime) END)))/60)::numeric(24,0) AS minutes,
sum(coalesce(b.rows,d.rows,s.rows)) AS ROWS,
trim(split_part(l.event,':',1)) AS event,
substring(trim(l.solution),1,60) AS solution,
max(l.query) AS sample_query,
count(DISTINCT l.query),
q.text AS query_text
FROM stl_alert_event_log AS l
LEFT JOIN stl_scan AS s ON s.query = l.query
AND s.slice = l.slice
AND s.segment = l.segment
LEFT JOIN stl_dist AS d ON d.query = l.query
AND d.slice = l.slice
AND d.segment = l.segment
LEFT JOIN stl_bcast AS b ON b.query = l.query
AND b.slice = l.slice
AND b.segment = l.segment
LEFT JOIN
(SELECT query,
LISTAGG(text) WITHIN
GROUP (
ORDER BY sequence) AS text
FROM stl_querytext
WHERE sequence < 100
GROUP BY query) AS q ON q.query = l.query
LEFT JOIN svv_table_info AS t ON t.table_id = s.tbl
WHERE l.userid >1
AND l.event_time >= dateadd(DAY, -7, CURRENT_DATE)
AND s.perm_table_name NOT LIKE 'volt_tt%'
AND SCHEMA IS NOT NULL
GROUP BY 1,
2,
5,
6,
query_text
ORDER BY 3 DESC, 7 DESC;
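For the common "Missing query planner statistics" alert, the usual remedy is to refresh statistics on the affected table. A minimal example (schema and table names are hypothetical):
-- Refresh planner statistics for a table flagged in the alerts above.
analyze myschema.mytable;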
/**********************************************************************************************
Purpose: Returns the per-hour Resources usage per queue for the past 2 days.
These results can be used to fine tune WLM queues and find peak times for workload.
Columns:
exec_hour: Hour of execution of queries
q: ID for the service class, defined in the WLM configuration file.
n_cp: Number of queries completed on that queue/hour
n_ev: Number of queries evicted/aborted on that queue/hour
avg_q_sec: Average queueing time in seconds
avg_e_sec: Average execution time in seconds
avg_pct_cpu: Average percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
max_pct_cpu: Max percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
max_spill: Max spill usage by a single query on that queue/hour
sum_spill_mb: Sum of spill usage by that queue on that hour
sum_row_scan: Sum of rows scanned on that queue/hour
sum_join_rows: Sum of rows joined on that queue/hour
sum_nl_join_rows: Sum of rows Joined using Nested Loops on that queue/hour
sum_ret_rows: Sum of rows returned to the leader/client on that queue/hour
sum_spec_mb: Sum of Megabytes scanned by a Spectrum query on that queue/hour
Notes:
History:
2017-08-09 ericnf created
2017-12-18 ericnf add rows for cached queries
**********************************************************************************************/
select date_trunc('hour', convert_timezone('utc','utc',w.exec_start_time)) as exec_hour, w.service_class as "Q", sum(decode(w.final_state, 'Completed',1,'Evicted',0,0)) as n_cp, sum(decode(w.final_state, 'Completed',0,'Evicted',1,0)) as n_ev, avg(w.total_queue_time/1000000) as avg_q_sec, avg(w.total_exec_time/1000000) as avg_e_sec,
avg(m.query_cpu_usage_percent) as avg_pct_cpu, max(m.query_cpu_usage_percent) as max_pct_cpu, max(m.query_temp_blocks_to_disk) as max_spill, sum(m.query_temp_blocks_to_disk) as sum_spill_mb, sum(m.scan_row_count) as sum_row_scan, sum(m.join_row_count) as sum_join_rows, sum(m.nested_loop_join_row_count) as sum_nl_join_rows,
sum(m.return_row_count) as sum_ret_rows, sum(m.spectrum_scan_size_mb) as sum_spec_mb
from stl_wlm_query as w left join svl_query_metrics_summary as m using (userid,service_Class, query)
where service_class > 5
and w.exec_start_time >= dateadd(day, -1, current_Date) group by 1,2
union all
select date_trunc('hour', convert_timezone('utc','utc',c.starttime)) as exec_hour, 0 as "Q", sum(decode(c.aborted, 1,0,1)) as n_cp, sum(decode(c.aborted, 1,1,0)) as n_ev, 0 as avg_q_sec, avg(c.elapsed/1000000) as avg_e_sec,
0 as avg_pct_cpu, 0 as max_pct_cpu, 0 as max_spill, 0 as sum_spill_mb, 0 as sum_row_scan, 0 as sum_join_rows, 0 as sum_nl_join_rows, sum(m.return_row_count) as sum_ret_rows, 0 as sum_spec_mb
from svl_qlog c left join svl_query_metrics_summary as m on ( c.userid = m.userid and c.source_query=m.query )
where source_query is not null and c.starttime >= dateadd(day, -1, current_Date)
group by 1,2 order by 1 desc,2 ;
/**********************************************************************************************
Purpose: Return table-related performance alerts from the past 7 days
Columns:
schema: Name of the schema
table: Name of the table where the alert happened, when applicable
table_rows: Size of the table in rows
minutes: Number of minutes spent doing the action. Not available for all alerts
scanned_rows: Number of rows in the scan/dist/bcast
event: What the alerted event is
solution: Proposed solution to avoid the alert (and performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count: Number of occurrences of the alert
Notes:
History:
2015-02-09 ericfe created
2018-09-12 ericfe re-publish to differentiate with the one with SQL text
**********************************************************************************************/
select trim(pgn.nspname) as Schema, trim(s.perm_table_name) as table , tot_rows as table_rows, (sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), coalesce(b.endtime,d.endtime,s.endtime))))/60)::numeric(24,0) as minutes,
sum(coalesce(b.rows,d.rows,s.rows)) as scanned_rows, trim(split_part(l.event,':',1)) as event, substring(trim(l.solution),1,60) as solution , max(l.query) as sample_query, count(*) as count
from stl_alert_event_log as l
left join stl_scan as s on s.query = l.query and s.slice = l.slice and s.segment = l.segment
left join stl_dist as d on d.query = l.query and d.slice = l.slice and d.segment = l.segment
left join stl_bcast as b on b.query = l.query and b.slice = l.slice and b.segment = l.segment
left join ( select id, sum(rows) as tot_rows from stv_Tbl_perm group by id) as t on s.tbl = t.id
left join pg_class as pgc on pgc.oid = s.tbl left join pg_namespace as pgn on pgn.oid = pgc.relnamespace
where l.userid >1 and l.event_time >= dateadd(day, -7, getdate()) AND s.perm_table_name NOT LIKE 'volt_tt%' AND s.perm_table_name NOT LIKE 'Internal Worktable'
group by 1,2,3, 6,7 order by 4 desc,8 desc;
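For the "Scanned a large number of deleted rows" alert, reclaiming deleted rows usually resolves it. A minimal example (schema and table names are hypothetical):
-- Reclaim space from deleted rows for a table flagged above.
vacuum delete only myschema.mytable;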
/**********************************************************************************************
Purpose: Return Table storage information (size, skew, etc)
Columns:
schema: Schema name
Table: Table name
id: Table id
DistKey: Distribution key (shows EVEN for even distribution, ALL for DISTSTYLE ALL)
Skew: Table Skew. Proportion between largest slice and smallest slice (null for diststyle ALL)
Sortkey: First column of Sortkey
#SKs: Number of columns in the compound sortkey
rows: Number of rows
mbytes: Size of the table in Megabytes
Enc: Y if the table has at least one compressed column, N otherwise
pct_enc: Proportion of number of encoded columns to total number of columns
pct_of_total: Size of the table in proportion to the cluster size
pct_stats_off: Measure of staleness of table statistics (real size versus size recorded in stats)
pct_unsorted: Proportion of unsorted rows compared to total rows
Notes:
History:
2015-02-16 ericfe created
2017-03-23 thiyagu Added percentage encoded column metric (pct_enc) and fixes
2017-10-01 mscaer Fixed columns "rows", pct_stats_off, and pct_unsorted to be correct for DISTSTYLE ALL.
**********************************************************************************************/
SELECT TRIM(pgn.nspname) AS SCHEMA,
TRIM(a.name) AS TABLE,
id AS TableId,
decode(pgc.reldiststyle,
0, 'EVEN',
1,det.distkey ,
8,'ALL'
) AS DistKey,
decode(pgc.reldiststyle,
8,NULL,
dist_ratio.ratio::DECIMAL(20,4)
) AS Skew,
det.head_sort AS "SortKey",
det.n_sortkeys AS "#SKs",
CASE WHEN pgc.reldiststyle = 8 THEN a.rows_all_dist ELSE a.rows END AS rows,
b.mbytes,
decode(det.max_enc,
0,'N',
'Y'
) AS Enc,
det.pct_enc,
decode(b.mbytes,
0,0,
((b.mbytes/part.total::DECIMAL)*100)::DECIMAL(20,2)
) AS pct_of_total,
(CASE WHEN a.rows = 0 THEN NULL ELSE
CASE WHEN pgc.reldiststyle = 8 THEN ((a.rows_all_dist - pgc.reltuples)::DECIMAL(20,3) / a.rows_all_dist::DECIMAL(20,3)*100)::DECIMAL(20,2)
ELSE ((a.rows - pgc.reltuples)::DECIMAL(20,3) / a.rows::DECIMAL(20,3)*100)::DECIMAL(20,2) END END
) AS pct_stats_off,
CASE WHEN pgc.reldiststyle = 8
THEN decode( det.n_sortkeys,0, NULL,DECODE( a.rows_all_dist,0,0, (a.unsorted_rows_all_dist::DECIMAL(32)/a.rows_all_dist)*100))::DECIMAL(20,2)
ELSE decode( det.n_sortkeys,0, NULL,DECODE( a.rows,0,0, (a.unsorted_rows::DECIMAL(32)/a.rows)*100))::DECIMAL(20,2) END
AS pct_unsorted
FROM (SELECT db_id,
id,
name,
SUM(ROWS) AS ROWS,
MAX(ROWS) AS rows_all_dist,
SUM(ROWS) - SUM(sorted_rows) AS unsorted_rows,
MAX(ROWS) - MAX(sorted_rows) AS unsorted_rows_all_dist
FROM stv_tbl_perm a
GROUP BY db_id,
id,
name) AS a
JOIN pg_class AS pgc ON pgc.oid = a.id
JOIN pg_namespace AS pgn ON pgn.oid = pgc.relnamespace
LEFT OUTER JOIN (SELECT tbl, COUNT(*) AS mbytes FROM stv_blocklist GROUP BY tbl) b ON a.id = b.tbl
INNER JOIN (SELECT attrelid,
MIN(CASE attisdistkey WHEN 't' THEN attname ELSE NULL END) AS "distkey",
MIN(CASE attsortkeyord WHEN 1 THEN attname ELSE NULL END) AS head_sort,
MAX(attsortkeyord) AS n_sortkeys,
MAX(attencodingtype) AS max_enc,
SUM(case when attencodingtype <> 0 then 1 else 0 end)::DECIMAL(20,3)/COUNT(attencodingtype)::DECIMAL(20,3) *100.00 as pct_enc
FROM pg_attribute
GROUP BY 1) AS det ON det.attrelid = a.id
INNER JOIN (SELECT tbl,
MAX(Mbytes)::DECIMAL(32) /MIN(Mbytes) AS ratio
FROM (SELECT tbl,
TRIM(name) AS name,
slice,
COUNT(*) AS Mbytes
FROM svv_diskusage
GROUP BY tbl,
name,
slice)
GROUP BY tbl,
name) AS dist_ratio ON a.id = dist_ratio.tbl
JOIN (SELECT SUM(capacity) AS total
FROM stv_partitions
WHERE part_begin = 0) AS part ON 1 = 1
WHERE mbytes IS NOT NULL
AND pgc.relowner > 1
-- and pgn.nspname = 'schema' -- schemaname
-- and a.name like 'table%' -- tablename
-- and det.max_enc = 0 -- non-compressed tables
ORDER BY mbytes DESC;
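The pct_of_total column above is relative to total cluster capacity from stv_partitions. A hedged companion sketch for overall cluster storage usage:
-- Overall cluster storage used versus capacity, in 1 MB blocks.
select sum(used) as used_mb,
       sum(capacity) as capacity_mb,
       (sum(used)::decimal / sum(capacity) * 100)::decimal(5,2) as pct_used
from stv_partitions
where part_begin = 0;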
/*
Table Skew Inspector. Please see http://docs.aws.amazon.com/redshift/latest/dg/c_analyzing-table-design.html
for more information.
Notes:
History:
2015-11-26 meyersi created
2016-09-13 chriz-bigdata rewrote to simplify and align with documentation
*/
SELECT SCHEMA schemaname,
"table" tablename,
table_id tableid,
size size_in_mb,
CASE
WHEN diststyle NOT IN ('EVEN','ALL') THEN 1
ELSE 0
END has_dist_key,
CASE
WHEN sortkey1 IS NOT NULL THEN 1
ELSE 0
END has_sort_key,
CASE
WHEN encoded = 'Y' THEN 1
ELSE 0
END has_col_encoding,
ROUND(100*CAST(max_blocks_per_slice - min_blocks_per_slice AS FLOAT) / GREATEST(NVL (min_blocks_per_slice,0)::int,1),2) ratio_skew_across_slices,
ROUND(CAST(100*dist_slice AS FLOAT) /(SELECT COUNT(DISTINCT slice) FROM stv_slices),2) pct_slices_populated
FROM svv_table_info ti
JOIN (SELECT tbl,
MIN(c) min_blocks_per_slice,
MAX(c) max_blocks_per_slice,
COUNT(DISTINCT slice) dist_slice
FROM (SELECT b.tbl,
b.slice,
COUNT(*) AS c
FROM STV_BLOCKLIST b
GROUP BY b.tbl,
b.slice)
WHERE tbl IN (SELECT table_id FROM svv_table_info)
GROUP BY tbl) iq ON iq.tbl = ti.table_id
ORDER BY SCHEMA, "table";
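If a table shows heavy skew, changing its distribution key or style is the usual fix. A hedged example (table and column names are hypothetical):
-- Redistribute a skewed table on a higher-cardinality column, or fall back to EVEN.
alter table myschema.skewed_table alter distkey customer_id;
-- alter table myschema.skewed_table alter diststyle even;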
/**********************************************************************************************
Purpose: Return the top 50 time-consuming statements, aggregated by their text.
Columns:
DB: Database where the query ran
n_qry: Number of Queries with same SQL text
qrytext: First 80 Characters of the query SQL
min/max/avg: Min/Max/Avg Execution time
total: Total execution time of all occurrences
max_query_id: Largest query id of the query occurrence
last_run: Last day the query ran
aborted: 0 if query ran to completion, 1 if it was canceled.
events: Alert events related to the query
Notes:
There is a commented filter to restrict the query to SELECT statements only (otherwise it includes all statements such as INSERT, UPDATE, and COPY)
There is a commented filter to narrow the query to a given database
History:
2015-02-09 ericfe created
2015-04-17 ericfe Added event name and event time filter
**********************************************************************************************/
-- query runtimes
select trim(database) as DB, count(query) as n_qry, max(substring (qrytext,1,80)) as qrytext, min(run_seconds) as "min" , max(run_seconds) as "max", avg(run_seconds) as "avg", sum(run_seconds) as total, max(query) as max_query_id,
max(starttime)::date as last_run, aborted,
listagg(event, ', ') within group (order by query) as events
from (
select userid, label, stl_query.query, trim(database) as database, trim(querytxt) as qrytext, md5(trim(querytxt)) as qry_md5, starttime, endtime, datediff(seconds, starttime,endtime)::numeric(12,2) as run_seconds,
aborted, decode(alrt.event,'Very selective query filter','Filter','Scanned a large number of deleted rows','Deleted','Nested Loop Join in the query plan','Nested Loop','Distributed a large number of rows across the network','Distributed','Broadcasted a large number of rows across the network','Broadcast','Missing query planner statistics','Stats',alrt.event) as event
from stl_query
left outer join ( select query, trim(split_part(event,':',1)) as event from STL_ALERT_EVENT_LOG where event_time >= dateadd(day, -7, current_Date) group by query, trim(split_part(event,':',1)) ) as alrt on alrt.query = stl_query.query
where userid <> 1
-- and (querytxt like 'SELECT%' or querytxt like 'select%' )
-- and database = ''
and starttime >= dateadd(day, -7, current_Date)
)
group by database, label, qry_md5, aborted
order by total desc limit 50;
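Once an interesting max_query_id is identified above, the full statement text can be reconstructed from stl_querytext, which stores it in 200-character chunks. A minimal sketch (the query ID is a hypothetical placeholder):
-- Pull the full SQL text for a single query, in order.
select query, sequence, text
from stl_querytext
where query = 123456
order by sequence;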