Friday, July 26, 2019

Monitoring Queries for Redshift


All Redshift monitoring queries in one place.
NOTE: All queries come from the AWS Redshift documentation and the amazon-redshift-utils repository. The system tables are documented in the "System Tables Reference" section of the Amazon Redshift Database Developer Guide, and the utility scripts live at https://github.com/awslabs/amazon-redshift-utils


--reason why an S3 COPY failed (replace <queryID> with the failing query's ID)
select d.query, substring(d.filename,14,20) as filename,
d.line_number as line,
substring(d.value,1,16) as value,
substring(le.err_reason,1,48) as err_reason
from stl_loaderror_detail d, stl_load_errors le
where d.query = le.query
and d.query = <queryID>;
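
If you don't yet know the failing query's ID, the most recent entries in STL_LOAD_ERRORS will point you to it. A minimal sketch, newest errors first:

--most recent load errors (use the query column as <queryID> above)
select query, starttime, trim(filename) as filename,
       line_number, err_code, trim(err_reason) as err_reason
from stl_load_errors
order by starttime desc
limit 10;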


--COPY performance information from the past 7 days
select q.starttime,  s.query, substring(q.querytxt,1,120) as querytxt,
       s.n_files, size_mb, s.time_seconds,
       s.size_mb/decode(s.time_seconds,0,1,s.time_seconds)  as mb_per_s
from (select query, count(*) as n_files,
     sum(transfer_size/(1024*1024)) as size_MB, (max(end_Time) -
         min(start_Time))/(1000000) as time_seconds , max(end_time) as end_time
      from stl_s3client where http_method = 'GET' and query > 0
       and transfer_time > 0 group by query ) as s
LEFT JOIN stl_Query as q on q.query = s.query
where s.end_Time >=  dateadd(day, -7, current_Date)
order by s.time_Seconds desc, size_mb desc, s.end_time desc
limit 50;
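
COPY throughput depends on how well the input files spread across slices; ideally n_files above is a multiple of the cluster's slice count. A quick check of the slice count:

--number of slices in the cluster (compare against n_files above)
select count(*) as n_slices from stv_slices;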


--sessions with currently running queries
SELECT s.process AS pid
       ,date_Trunc ('second',s.starttime) AS S_START
       ,datediff(minutes,s.starttime,getdate ()) AS conn_mins
       ,trim(s.user_name) AS USER
       ,trim(s.db_name) AS DB
       ,date_trunc ('second',i.starttime) AS Q_START
       ,i.query
       ,trim(i.query) AS sql
FROM stv_sessions s
  LEFT JOIN stv_recents i
         ON s.process = i.pid
        AND i.status = 'Running'
WHERE s.user_name <> 'rdsdb'
ORDER BY 1;
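
If one of these sessions needs to be killed outright, pg_terminate_backend takes its pid (the pid below is a placeholder):

--terminate a session by pid (placeholder value)
select pg_terminate_backend(12345);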

    

--running queues: queries currently executing or waiting in WLM, with queue time, spill, and alert flags
select trim(u.usename) as user, s.pid, q.xid,q.query,q.service_class as "q", q.slot_count as slt, date_trunc('second',q.wlm_start_time) as start,decode(trim(q.state), 'Running','Run','QueuedWaiting','Queue','Returning','Return',trim(q.state)) as state, 
q.queue_Time/1000000 as q_sec, q.exec_time/1000000 as exe_sec, m.cpu_time/1000000 cpu_sec, m.blocks_read read_mb, decode(m.blocks_to_disk,-1,null,m.blocks_to_disk) spill_mb , m2.rows as ret_rows, m3.rows as NL_rows,
substring(replace(nvl(qrytext_cur.text,trim(translate(s.text,chr(10)||chr(13)||chr(9) ,''))),'\\n',' '),1,90) as sql,
trim(decode(event&1,1,'SK ','') || decode(event&2,2,'Del ','') || decode(event&4,4,'NL ','') ||  decode(event&8,8,'Dist ','') || decode(event&16,16,'Bcast ','') || decode(event&32,32,'Stats ','')) as Alert
from  stv_wlm_query_state q 
left outer join stl_querytext s on (s.query=q.query and sequence = 0)
left outer join stv_query_metrics m on ( q.query = m.query and m.segment=-1 and m.step=-1 )
left outer join stv_query_metrics m2 on ( q.query = m2.query and m2.step_type = 38 )
left outer join ( select query, sum(rows) as rows from stv_query_metrics m3 where step_type = 15 group by 1) as m3 on ( q.query = m3.query )
left outer join pg_user u on ( s.userid = u.usesysid )
LEFT OUTER JOIN (SELECT ut.xid,'CURSOR ' || TRIM( substring ( TEXT from strpos(upper(TEXT),'SELECT') )) as TEXT
                   FROM stl_utilitytext ut
                   WHERE sequence = 0
                   AND upper(TEXT) like 'DECLARE%'
                   GROUP BY text, ut.xid) qrytext_cur ON (q.xid = qrytext_cur.xid)
left outer join ( select query,sum(decode(trim(split_part(event,':',1)),'Very selective query filter',1,'Scanned a large number of deleted rows',2,'Nested Loop Join in the query plan',4,'Distributed a large number of rows across the network',8,'Broadcasted a large number of rows across the network',16,'Missing query planner statistics',32,0)) as event from STL_ALERT_EVENT_LOG 
     where event_time >=  dateadd(hour, -8, current_Date) group by query  ) as alrt on alrt.query = q.query
order by q.service_class,q.exec_time desc, q.wlm_start_time;
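
To stop one of the queries listed above without killing its session, CANCEL takes the pid from the first column (placeholder value shown):

--cancel a running query by pid (placeholder value)
cancel 18764;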


    
/**********************************************************************************************
Purpose: Return Alerts from past 7 days
Columns:
table:     Name of the table where the alert happened, when applicable
minutes:     Number of minutes spent doing the action. Not available for all alerts
rows:           Number of rows in the scan/dist/bcast
event:     The event that triggered the alert
solution:    Proposed solution to avoid the alert (and the performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count:     Number of occurrences of the alert
Notes:
History:
2015-02-09 ericfe created
2015-04-17 ericfe Added detail information on distributions and broadcasts. Added rows column
2017-08-11 andrewgross Added schema and query text information.
**********************************************************************************************/
SELECT t.schema AS SCHEMA,
       trim(s.perm_table_name) AS TABLE,
       (sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), CASE WHEN coalesce(b.endtime,d.endtime,s.endtime) > coalesce(b.starttime,d.starttime,s.starttime) THEN coalesce(b.endtime,d.endtime,s.endtime) ELSE coalesce(b.starttime,d.starttime,s.starttime) END)))/60)::numeric(24,0) AS minutes,
       sum(coalesce(b.rows,d.rows,s.rows)) AS ROWS,
       trim(split_part(l.event,':',1)) AS event,
       substring(trim(l.solution),1,60) AS solution,
       max(l.query) AS sample_query,
       count(DISTINCT l.query),
       q.text AS query_text
FROM stl_alert_event_log AS l
LEFT JOIN stl_scan AS s ON s.query = l.query
AND s.slice = l.slice
AND s.segment = l.segment
LEFT JOIN stl_dist AS d ON d.query = l.query
AND d.slice = l.slice
AND d.segment = l.segment
LEFT JOIN stl_bcast AS b ON b.query = l.query
AND b.slice = l.slice
AND b.segment = l.segment
LEFT JOIN
  (SELECT query,
           LISTAGG(text) WITHIN GROUP (ORDER BY sequence) AS text
   FROM stl_querytext
   WHERE sequence < 100
   GROUP BY query) AS q ON q.query = l.query
LEFT JOIN svv_table_info AS t ON t.table_id = s.tbl
WHERE l.userid >1
  AND l.event_time >= dateadd(DAY, -7, CURRENT_DATE)
  AND s.perm_table_name NOT LIKE 'volt_tt%'
  AND SCHEMA IS NOT NULL
GROUP BY 1,
         2,
         5,
         6,
         query_text
ORDER BY 3 DESC, 7 DESC;
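
Most alerts map to a simple follow-up. A sketch for the two most common ones, with a placeholder table name:

--"Missing query planner statistics": refresh the table's statistics
analyze public.my_table;
--"Scanned a large number of deleted rows": reclaim space from deleted rows
vacuum delete only public.my_table;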


/**********************************************************************************************
Purpose: Returns the per-hour resource usage per queue for the past 2 days.
These results can be used to fine-tune WLM queues and find peak workload times.
    
Columns:

   exec_hour: Hour of execution of queries
   q: ID for the service class, defined in the WLM configuration file. 
   n_cp: Number of queries completed on that queue/hour
   n_ev: Number of queries evicted on that queue/hour
   avg_q_sec: Average queueing time in seconds
   avg_e_sec: Average execution time in seconds
   avg_pct_cpu: Average percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
   max_pct_cpu: Max percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
   max_spill: Max spill (MB) by a single query on that queue/hour
   sum_spill_mb: Sum of Spill usage by that queue on that hour
   sum_row_scan: Sum of rows scanned on that queue/hour
   sum_join_rows: Sum of rows joined on that queue/hour
   sum_nl_join_rows: Sum of rows Joined using Nested Loops on that queue/hour
   sum_ret_rows: Sum of rows returned to the leader/client on that queue/hour
   sum_spec_mb: Sum of Megabytes scanned by a Spectrum query on that queue/hour
   
Notes:


History:

2017-08-09 ericnf created
2017-12-18 ericnf add rows for cached queries
**********************************************************************************************/
select date_trunc('hour', convert_timezone('utc','utc',w.exec_start_time)) as exec_hour, w.service_class as "Q", sum(decode(w.final_state, 'Completed',1,'Evicted',0,0)) as n_cp,  sum(decode(w.final_state, 'Completed',0,'Evicted',1,0)) as n_ev, avg(w.total_queue_time/1000000) as avg_q_sec, avg(w.total_exec_time/1000000) as avg_e_sec,
       avg(m.query_cpu_usage_percent) as avg_pct_cpu, max(m.query_cpu_usage_percent) as max_pct_cpu, max(m.query_temp_blocks_to_disk) as max_spill, sum(m.query_temp_blocks_to_disk) as sum_spill_mb, sum(m.scan_row_count) as sum_row_scan, sum(m.join_row_count) as sum_join_rows, sum(m.nested_loop_join_row_count) as sum_nl_join_rows, 
       sum(m.return_row_count) as sum_ret_rows, sum(m.spectrum_scan_size_mb) as sum_spec_mb
from   stl_wlm_query as w left join svl_query_metrics_summary as m using (userid,service_Class, query)
where  service_class > 5 
  and     w.exec_start_time >=  dateadd(day, -1, current_Date) group by 1,2 
union all
select date_trunc('hour', convert_timezone('utc','utc',c.starttime)) as exec_hour, 0 as "Q", sum(decode(c.aborted, 1,0,1)) as n_cp,  sum(decode(c.aborted, 1,1,0)) as n_ev, 0 as avg_q_sec, avg(c.elapsed/1000000) as avg_e_sec,
       0 as avg_pct_cpu, 0 as max_pct_cpu, 0 as max_spill, 0 as sum_spill_mb, 0 as sum_row_scan, 0 as sum_join_rows, 0 as sum_nl_join_rows, sum(m.return_row_count) as sum_ret_rows, 0 as sum_spec_mb
from svl_qlog c left join svl_query_metrics_summary as m on ( c.userid = m.userid and c.source_query=m.query ) 
 where source_query is not null and     c.starttime >=  dateadd(day, -1, current_Date)
group by 1,2  order by  1 desc,2 ;
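
To interpret these per-queue numbers it helps to see how the queues are configured. A sketch against STV_WLM_SERVICE_CLASS_CONFIG (the column selection here is an assumption; service classes 1-5 are reserved for system queues):

--current WLM queue definitions
select service_class, trim(name) as name, num_query_tasks as slots,
       query_working_mem as mem_mb, max_execution_time as timeout_ms
from stv_wlm_service_class_config
where service_class > 5;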


/**********************************************************************************************
Purpose: Return table-related performance alerts from the past 7 days

Columns:
schema:    Name of the schema
table:     Name of the table where the alert happened, when applicable
table_rows: Size of table in rows
minutes:     Number of minutes spent doing the action. Not available for all alerts
scanned_rows:   Number of rows in the scan/dist/bcast
event:     The event that triggered the alert
solution:    Proposed solution to avoid the alert (and the performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count:     Number of occurrences of the alert

Notes:

History:
2015-02-09 ericfe created
2018-09-12 ericfe re-publish to differentiate with the one with SQL text
**********************************************************************************************/
select trim(pgn.nspname) as Schema, trim(s.perm_table_name) as table , tot_rows as table_rows, (sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), coalesce(b.endtime,d.endtime,s.endtime))))/60)::numeric(24,0) as minutes, 
       sum(coalesce(b.rows,d.rows,s.rows)) as scanned_rows, trim(split_part(l.event,':',1)) as event,  substring(trim(l.solution),1,60) as solution , max(l.query) as sample_query, count(*) as count
from stl_alert_event_log as l 
left join stl_scan as s on s.query = l.query and s.slice = l.slice and s.segment = l.segment
left join stl_dist as d on d.query = l.query and d.slice = l.slice and d.segment = l.segment 
left join stl_bcast as b on b.query = l.query and b.slice = l.slice and b.segment = l.segment
left join ( select id, sum(rows) as tot_rows from stv_Tbl_perm group by id) as t on s.tbl = t.id 
left join pg_class as pgc on pgc.oid = s.tbl left join pg_namespace as pgn on pgn.oid = pgc.relnamespace
where l.userid >1  and  l.event_time >=  dateadd(day, -7, getdate()) AND s.perm_table_name NOT LIKE 'volt_tt%' AND s.perm_table_name NOT LIKE 'Internal Worktable'
group by  1,2,3, 6,7 order by 4 desc,8 desc;
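
The solution column above is truncated to 60 characters. To read the full recommendation for one alert, look it up by its sample_query ID (placeholder value shown):

--full alert text for one query
select trim(event) as event, trim(solution) as solution
from stl_alert_event_log
where query = 123456;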

/**********************************************************************************************
Purpose: Return Table storage information (size, skew, etc)
Columns:
schema: Schema name
Table: Table name
id: Table id
DistKey: Distribution key (shows EVEN for evenly distributed tables, ALL for DISTSTYLE ALL)
Skew: Table Skew. Proportion between largest slice and smallest slice (null for diststyle ALL)
Sortkey: First column of Sortkey
#SKs: Number of columns in the compound sortkey
rows: Number of rows
mbytes: Size of the table in Megabytes
Enc: Y if the table has at least one compressed column, N otherwise
pct_enc:        Proportion of number of encoded columns to total number of columns
pct_of_total: Size of the table in proportion to the cluster size
pct_stats_off: Measure of staleness of table statistics (real size versus size recorded in stats)
pct_unsorted: Proportion of unsorted rows compared to total rows
Notes:

History:
2015-02-16 ericfe created
2017-03-23 thiyagu Added percentage encoded column metric (pct_enc) and fixes  
2017-10-01 mscaer Fixed columns "rows", pct_stats_off, and pct_unsorted to be correct for DISTSTYLE ALL.
**********************************************************************************************/

SELECT TRIM(pgn.nspname) AS SCHEMA,
       TRIM(a.name) AS TABLE,
       id AS TableId,
       decode(pgc.reldiststyle,
             0, 'EVEN',
             1,det.distkey ,
             8,'ALL'
       ) AS DistKey,
       decode(pgc.reldiststyle,
             8,NULL,
             dist_ratio.ratio::DECIMAL(20,4)
       ) AS Skew,
       det.head_sort AS "SortKey",
       det.n_sortkeys AS "#SKs",
       CASE WHEN pgc.reldiststyle = 8 THEN a.rows_all_dist ELSE a.rows END AS rows,
       b.mbytes,
       decode(det.max_enc,
             0,'N',
             'Y'
       ) AS Enc,
       det.pct_enc,
       decode(b.mbytes,
             0,0,
             ((b.mbytes/part.total::DECIMAL)*100)::DECIMAL(20,2)
       ) AS pct_of_total,
       (CASE WHEN a.rows = 0 THEN NULL ELSE 
          CASE WHEN pgc.reldiststyle = 8 THEN ((a.rows_all_dist - pgc.reltuples)::DECIMAL(20,3) / a.rows_all_dist::DECIMAL(20,3)*100)::DECIMAL(20,2)
                ELSE ((a.rows - pgc.reltuples)::DECIMAL(20,3) / a.rows::DECIMAL(20,3)*100)::DECIMAL(20,2) END END
       ) AS pct_stats_off,
       CASE WHEN pgc.reldiststyle = 8 
          THEN decode( det.n_sortkeys,0, NULL,DECODE( a.rows_all_dist,0,0, (a.unsorted_rows_all_dist::DECIMAL(32)/a.rows_all_dist)*100))::DECIMAL(20,2)
          ELSE decode( det.n_sortkeys,0, NULL,DECODE( a.rows,0,0, (a.unsorted_rows::DECIMAL(32)/a.rows)*100))::DECIMAL(20,2) END
        AS pct_unsorted
FROM (SELECT db_id,
             id,
             name,
             SUM(ROWS) AS ROWS,
             MAX(ROWS) AS rows_all_dist,
             SUM(ROWS) - SUM(sorted_rows) AS unsorted_rows,
             MAX(ROWS) - MAX(sorted_rows) AS unsorted_rows_all_dist
      FROM stv_tbl_perm a
      GROUP BY db_id,
               id,
               name) AS a
  JOIN pg_class AS pgc ON pgc.oid = a.id
  JOIN pg_namespace AS pgn ON pgn.oid = pgc.relnamespace
  LEFT OUTER JOIN (SELECT tbl, COUNT(*) AS mbytes FROM stv_blocklist GROUP BY tbl) b ON a.id = b.tbl
  INNER JOIN (SELECT attrelid,
                     MIN(CASE attisdistkey WHEN 't' THEN attname ELSE NULL END) AS "distkey",
                     MIN(CASE attsortkeyord WHEN 1 THEN attname ELSE NULL END) AS head_sort,
                     MAX(attsortkeyord) AS n_sortkeys,
                     MAX(attencodingtype) AS max_enc,
                     SUM(case when attencodingtype <> 0 then 1 else 0 end)::DECIMAL(20,3)/COUNT(attencodingtype)::DECIMAL(20,3)  *100.00 as pct_enc
              FROM pg_attribute
              GROUP BY 1) AS det ON det.attrelid = a.id
  INNER JOIN (SELECT tbl,
                     MAX(Mbytes)::DECIMAL(32) /MIN(Mbytes) AS ratio
              FROM (SELECT tbl,
                           TRIM(name) AS name,
                           slice,
                           COUNT(*) AS Mbytes
                    FROM svv_diskusage
                    GROUP BY tbl,
                             name,
                             slice)
              GROUP BY tbl,
                       name) AS dist_ratio ON a.id = dist_ratio.tbl
  JOIN (SELECT SUM(capacity) AS total
        FROM stv_partitions
        WHERE part_begin = 0) AS part ON 1 = 1
WHERE mbytes IS NOT NULL
AND   pgc.relowner > 1
-- and pgn.nspname = 'schema' -- schemaname
-- and a.name like 'table%' -- tablename
-- and det.max_enc = 0 -- non-compressed tables
ORDER BY mbytes DESC;
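
High pct_unsorted and pct_stats_off usually call for routine maintenance. A minimal sketch, with a placeholder table name:

--high pct_unsorted: re-sort the table
vacuum sort only public.my_table;
--high pct_stats_off: refresh planner statistics
analyze public.my_table;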


/*

Table Skew Inspector. Please see http://docs.aws.amazon.com/redshift/latest/dg/c_analyzing-table-design.html
for more information.

Notes:


History:
2015-11-26 meyersi created
2016-09-13 chriz-bigdata rewrote to simplify and align with documentation

*/
SELECT SCHEMA schemaname,
       "table" tablename,
       table_id tableid,
       size size_in_mb,
       CASE
         WHEN diststyle NOT IN ('EVEN','ALL') THEN 1
         ELSE 0
       END has_dist_key,
       CASE
         WHEN sortkey1 IS NOT NULL THEN 1
         ELSE 0
       END has_sort_key,
       CASE
         WHEN encoded = 'Y' THEN 1
         ELSE 0
       END has_col_encoding,
       ROUND(100*CAST(max_blocks_per_slice - min_blocks_per_slice AS FLOAT) / GREATEST(NVL (min_blocks_per_slice,0)::int,1),2) ratio_skew_across_slices,
       ROUND(CAST(100*dist_slice AS FLOAT) /(SELECT COUNT(DISTINCT slice) FROM stv_slices),2) pct_slices_populated
FROM svv_table_info ti
  JOIN (SELECT tbl,
               MIN(c) min_blocks_per_slice,
               MAX(c) max_blocks_per_slice,
               COUNT(DISTINCT slice) dist_slice
        FROM (SELECT b.tbl,
                     b.slice,
                     COUNT(*) AS c
              FROM STV_BLOCKLIST b
              GROUP BY b.tbl,
                       b.slice)
        WHERE tbl IN (SELECT table_id FROM svv_table_info)
        GROUP BY tbl) iq ON iq.tbl = ti.table_id
ORDER BY SCHEMA, "table";
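
A common fix for heavy skew is a deep copy into a table with a better distribution key. A sketch with placeholder table and column names:

--deep copy into a new distribution key, then swap names
create table public.my_table_new distkey (customer_id) as
select * from public.my_table;
alter table public.my_table rename to my_table_old;
alter table public.my_table_new rename to my_table;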


/**********************************************************************************************
Purpose: Return the top 50 time-consuming statements, aggregated by their SQL text.

Columns:
DB: Database where the query ran
n_qry: Number of Queries with same SQL text
qrytext: First 80 Characters of the query SQL
min/max/avg: Min/Max/Avg Execution time
total: Total execution time of all occurrences
max_query_id: Largest query id among the occurrences
last_run: Last day the query ran
aborted: 0 if query ran to completion, 1 if it was canceled.
alerts: Alert events related to the query

Notes:
There is a commented-out filter to restrict the results to SELECT statements only (otherwise all statements are included: INSERT, UPDATE, COPY, etc.)
There is a commented-out filter to narrow the results to a given database

History:
2015-02-09 ericfe created
2015-04-17 ericfe Added event name and event time filter
**********************************************************************************************/
-- query runtimes
select trim(database) as DB, count(query) as n_qry, max(substring (qrytext,1,80)) as qrytext, min(run_seconds) as "min" , max(run_seconds) as "max", avg(run_seconds) as "avg", sum(run_seconds) as total,  max(query) as max_query_id, 
max(starttime)::date as last_run, aborted,
listagg(event, ', ') within group (order by query) as events
from (
select userid, label, stl_query.query, trim(database) as database, trim(querytxt) as qrytext, md5(trim(querytxt)) as qry_md5, starttime, endtime, datediff(seconds, starttime,endtime)::numeric(12,2) as run_seconds, 
       aborted, decode(alrt.event,'Very selective query filter','Filter','Scanned a large number of deleted rows','Deleted','Nested Loop Join in the query plan','Nested Loop','Distributed a large number of rows across the network','Distributed','Broadcasted a large number of rows across the network','Broadcast','Missing query planner statistics','Stats',alrt.event) as event
from stl_query 
left outer join ( select query, trim(split_part(event,':',1)) as event from STL_ALERT_EVENT_LOG where event_time >=  dateadd(day, -7, current_Date)  group by query, trim(split_part(event,':',1)) ) as alrt on alrt.query = stl_query.query
where userid <> 1 
-- and (querytxt like 'SELECT%' or querytxt like 'select%' ) 
-- and database = ''
and starttime >=  dateadd(day, -7, current_Date)
 ) 
group by database, label, qry_md5, aborted
order by total desc limit 50;
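
qrytext above is truncated to 80 characters. To reconstruct the full SQL for a given max_query_id (placeholder value shown), stitch the pieces back together from STL_QUERYTEXT:

--full SQL text for one query id
select query, listagg(text) within group (order by sequence) as full_sql
from stl_querytext
where query = 123456
group by query;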
