Friday, July 26, 2019

Monitoring Queries for Redshift


All Redshift monitoring queries in one place.
NOTE: All queries come from the AWS Redshift documentation and the Redshift Utilities repository. Here is the link for all the system tables:
And the Redshift Utility information:


--reason why an S3 COPY failed (replace <queryID> with the failing query's ID)
select d.query, substring(d.filename,14,20),
d.line_number as line,
substring(d.value,1,16) as value,
substring(le.err_reason,1,48) as err_reason
from stl_loaderror_detail d, stl_load_errors le
where d.query = le.query
and d.query = <queryID>;
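
--if you don't have the query ID yet, a small helper like this (not from the AWS docs) usually surfaces recent load errors and their IDs:
select query, starttime, trim(filename) as filename, err_code, trim(err_reason) as err_reason
from stl_load_errors
order by starttime desc
limit 10;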


--copy performance information from past 7 days
select q.starttime,  s.query, substring(q.querytxt,1,120) as querytxt,
       s.n_files, size_mb, s.time_seconds,
       s.size_mb/decode(s.time_seconds,0,1,s.time_seconds)  as mb_per_s
from (select query, count(*) as n_files,
     sum(transfer_size/(1024*1024)) as size_MB, (max(end_Time) -
         min(start_Time))/(1000000) as time_seconds , max(end_time) as end_time
      from stl_s3client where http_method = 'GET' and query > 0
       and transfer_time > 0 group by query ) as s
LEFT JOIN stl_Query as q on q.query = s.query
where s.end_Time >=  dateadd(day, -7, current_Date)
order by s.time_Seconds desc, size_mb desc, s.end_time desc
limit 50;


--sessions with currently running queries
SELECT s.process AS pid
       ,date_Trunc ('second',s.starttime) AS S_START
       ,datediff(minutes,s.starttime,getdate ()) AS conn_mins
       ,trim(s.user_name) AS USER
       ,trim(s.db_name) AS DB
       ,date_trunc ('second',i.starttime) AS Q_START
       ,i.query
       ,trim(i.query) AS sql
FROM stv_sessions s
  LEFT JOIN stv_recents i
         ON s.process = i.pid
        AND i.status = 'Running'
WHERE s.user_name <> 'rdsdb'
ORDER BY 1;
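
--if a session from the list above needs to be killed, Redshift supports pg_terminate_backend; use with care, and take <pid> from the query above:
select pg_terminate_backend(<pid>);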

    

--currently running queries and their WLM queues:
select trim(u.usename) as user, s.pid, q.xid,q.query,q.service_class as "q", q.slot_count as slt, date_trunc('second',q.wlm_start_time) as start,decode(trim(q.state), 'Running','Run','QueuedWaiting','Queue','Returning','Return',trim(q.state)) as state, 
q.queue_Time/1000000 as q_sec, q.exec_time/1000000 as exe_sec, m.cpu_time/1000000 cpu_sec, m.blocks_read read_mb, decode(m.blocks_to_disk,-1,null,m.blocks_to_disk) spill_mb , m2.rows as ret_rows, m3.rows as NL_rows,
substring(replace(nvl(qrytext_cur.text,trim(translate(s.text,chr(10)||chr(13)||chr(9) ,''))),'\\n',' '),1,90) as sql,
trim(decode(event&1,1,'SK ','') || decode(event&2,2,'Del ','') || decode(event&4,4,'NL ','') ||  decode(event&8,8,'Dist ','') || decode(event&16,16,'Bcast ','') || decode(event&32,32,'Stats ','')) as Alert
from  stv_wlm_query_state q 
left outer join stl_querytext s on (s.query=q.query and sequence = 0)
left outer join stv_query_metrics m on ( q.query = m.query and m.segment=-1 and m.step=-1 )
left outer join stv_query_metrics m2 on ( q.query = m2.query and m2.step_type = 38 )
left outer join ( select query, sum(rows) as rows from stv_query_metrics m3 where step_type = 15 group by 1) as m3 on ( q.query = m3.query )
left outer join pg_user u on ( s.userid = u.usesysid )
LEFT OUTER JOIN (SELECT ut.xid,'CURSOR ' || TRIM( substring ( TEXT from strpos(upper(TEXT),'SELECT') )) as TEXT
                   FROM stl_utilitytext ut
                   WHERE sequence = 0
                   AND upper(TEXT) like 'DECLARE%'
                   GROUP BY text, ut.xid) qrytext_cur ON (q.xid = qrytext_cur.xid)
left outer join ( select query,sum(decode(trim(split_part(event,':',1)),'Very selective query filter',1,'Scanned a large number of deleted rows',2,'Nested Loop Join in the query plan',4,'Distributed a large number of rows across the network',8,'Broadcasted a large number of rows across the network',16,'Missing query planner statistics',32,0)) as event from STL_ALERT_EVENT_LOG 
     where event_time >=  dateadd(hour, -8, current_Date) group by query  ) as alrt on alrt.query = q.query
order by q.service_class,q.exec_time desc, q.wlm_start_time;


    
/**********************************************************************************************
Purpose: Return Alerts from past 7 days
Columns:
table:        Name of the table where the alert happened, when applicable
minutes:      Number of minutes spent doing the action. Not available for all alerts
rows:         Number of rows of the scan/dist/bcast
event:        What the alerted event is
solution:     Proposed solution to avoid the alert (and performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count:        Number of occurrences of the alert
Notes:
History:
2015-02-09 ericfe created
2015-04-17 ericfe Added detail information on distributions and broadcasts. Added rows column
2017-08-11 andrewgross Added schema and query text information.
**********************************************************************************************/
SELECT t.schema AS SCHEMA,
       trim(s.perm_table_name) AS TABLE,
       (sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), CASE WHEN coalesce(b.endtime,d.endtime,s.endtime) > coalesce(b.starttime,d.starttime,s.starttime) THEN coalesce(b.endtime,d.endtime,s.endtime) ELSE coalesce(b.starttime,d.starttime,s.starttime) END)))/60)::numeric(24,0) AS minutes,
       sum(coalesce(b.rows,d.rows,s.rows)) AS ROWS,
       trim(split_part(l.event,':',1)) AS event,
       substring(trim(l.solution),1,60) AS solution,
       max(l.query) AS sample_query,
       count(DISTINCT l.query),
       q.text AS query_text
FROM stl_alert_event_log AS l
LEFT JOIN stl_scan AS s ON s.query = l.query
AND s.slice = l.slice
AND s.segment = l.segment
LEFT JOIN stl_dist AS d ON d.query = l.query
AND d.slice = l.slice
AND d.segment = l.segment
LEFT JOIN stl_bcast AS b ON b.query = l.query
AND b.slice = l.slice
AND b.segment = l.segment
LEFT JOIN
  (SELECT query,
          LISTAGG(text) WITHIN GROUP (ORDER BY sequence) AS text
   FROM stl_querytext
   WHERE sequence < 100
   GROUP BY query) AS q ON q.query = l.query
LEFT JOIN svv_table_info AS t ON t.table_id = s.tbl
WHERE l.userid >1
  AND l.event_time >= dateadd(DAY, -7, CURRENT_DATE)
  AND s.perm_table_name NOT LIKE 'volt_tt%'
  AND SCHEMA IS NOT NULL
GROUP BY 1,
         2,
         5,
         6,
         query_text
ORDER BY 3 DESC, 7 DESC;
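
--when the event column shows the missing planner statistics alert (Stats), the usual follow-up is to refresh statistics on the affected table; schema and table names below are placeholders:
analyze <schema>.<table>;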


/**********************************************************************************************
Purpose: Returns the per-hour resource usage per queue for the past 2 days. 
These results can be used to fine tune WLM queues and find peak times for workload.
    
Columns:

   exec_hour: Hour of execution of queries
   q: ID for the service class, defined in the WLM configuration file. 
   n_cp: Number of queries executed on that queue/hour
   avg_q_sec: Average queueing time in seconds
   avg_e_sec: Average execution time in seconds
   avg_pct_cpu: Average percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
   max_pct_cpu: Max percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
   sum_spill_mb: Sum of Spill usage by that queue on that hour
   sum_row_scan: Sum of rows scanned on that queue/hour
   sum_join_rows: Sum of rows joined on that queue/hour
   sum_nl_join_rows: Sum of rows Joined using Nested Loops on that queue/hour
   sum_ret_rows: Sum of rows returned to the leader/client on that queue/hour
   sum_spec_mb: Sum of Megabytes scanned by a Spectrum query on that queue/hour
   
Notes:


History:

2017-08-09 ericnf created
2017-12-18 ericnf add rows for cached queries
**********************************************************************************************/
select date_trunc('hour', convert_timezone('utc','utc',w.exec_start_time)) as exec_hour, w.service_class as "Q", sum(decode(w.final_state, 'Completed',1,'Evicted',0,0)) as n_cp,  sum(decode(w.final_state, 'Completed',0,'Evicted',1,0)) as n_ev, avg(w.total_queue_time/1000000) as avg_q_sec, avg(w.total_exec_time/1000000) as avg_e_sec,
       avg(m.query_cpu_usage_percent) as avg_pct_cpu, max(m.query_cpu_usage_percent) as max_pct_cpu, max(m.query_temp_blocks_to_disk) as max_spill, sum(m.query_temp_blocks_to_disk) as sum_spill_mb, sum(m.scan_row_count) as sum_row_scan, sum(m.join_row_count) as sum_join_rows, sum(m.nested_loop_join_row_count) as sum_nl_join_rows, 
       sum(m.return_row_count) as sum_ret_rows, sum(m.spectrum_scan_size_mb) as sum_spec_mb
from   stl_wlm_query as w left join svl_query_metrics_summary as m using (userid,service_Class, query)
where  service_class > 5 
  and     w.exec_start_time >=  dateadd(day, -1, current_Date) group by 1,2 
union all
select date_trunc('hour', convert_timezone('utc','utc',c.starttime)) as exec_hour, 0 as "Q", sum(decode(c.aborted, 1,0,1)) as n_cp,  sum(decode(c.aborted, 1,1,0)) as n_ev, 0 as avg_q_sec, avg(c.elapsed/1000000) as avg_e_sec,
       0 as avg_pct_cpu, 0 as max_pct_cpu, 0 as max_spill, 0 as sum_spill_mb, 0 as sum_row_scan, 0 as sum_join_rows, 0 as sum_nl_join_rows, sum(m.return_row_count) as sum_ret_rows, 0 as sum_spec_mb
from svl_qlog c left join svl_query_metrics_summary as m on ( c.userid = m.userid and c.source_query=m.query ) 
 where source_query is not null and     c.starttime >=  dateadd(day, -1, current_Date)
group by 1,2  order by  1 desc,2 ;
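
--to relate these numbers back to the queue definitions, the current WLM setup can be checked with something like this (assuming your cluster exposes the usual STV_WLM_SERVICE_CLASS_CONFIG columns):
select service_class, trim(name) as queue_name, num_query_tasks as slots, query_working_mem, max_execution_time
from stv_wlm_service_class_config
where service_class > 5
order by service_class;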


/**********************************************************************************************
Purpose: Return table-related performance alerts from past 7 days

Columns:
schema: Name of schema
table:     Name of the table where the alert happened, when applicable
table_rows: Size of table in rows
minutes:     Number of minutes spent doing the action. Not available for all alerts
scanned_rows:   Number of rows of the scan/dist/bcast
event:     What the alerted event is
solution:     Proposed solution to avoid the alert (and performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count:     Number of occurrences of the alert

Notes:

History:
2015-02-09 ericfe created
2018-09-12 ericfe re-publish to differentiate with the one with SQL text
**********************************************************************************************/
select trim(pgn.nspname) as Schema, trim(s.perm_table_name) as table , tot_rows as table_rows, (sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), coalesce(b.endtime,d.endtime,s.endtime))))/60)::numeric(24,0) as minutes, 
       sum(coalesce(b.rows,d.rows,s.rows)) as scanned_rows, trim(split_part(l.event,':',1)) as event,  substring(trim(l.solution),1,60) as solution , max(l.query) as sample_query, count(*) as count
from stl_alert_event_log as l 
left join stl_scan as s on s.query = l.query and s.slice = l.slice and s.segment = l.segment
left join stl_dist as d on d.query = l.query and d.slice = l.slice and d.segment = l.segment 
left join stl_bcast as b on b.query = l.query and b.slice = l.slice and b.segment = l.segment
left join ( select id, sum(rows) as tot_rows from stv_Tbl_perm group by id) as t on s.tbl = t.id 
left join pg_class as pgc on pgc.oid = s.tbl left join pg_namespace as pgn on pgn.oid = pgc.relnamespace
where l.userid >1  and  l.event_time >=  dateadd(day, -7, getdate()) AND s.perm_table_name NOT LIKE 'volt_tt%' AND s.perm_table_name NOT LIKE 'Internal Worktable'
group by  1,2,3, 6,7 order by 4 desc,8 desc;

/**********************************************************************************************
Purpose: Return Table storage information (size, skew, etc)
Columns:
schema: Schema name
Table: Table name
id: Table id
DistKey: Distribution key (shows EVEN for evenly distributed tables, ALL for DISTSTYLE ALL)
Skew: Table Skew. Proportion between largest slice and smallest slice (null for diststyle ALL)
Sortkey: First column of Sortkey
#SKs: Number of columns in the compound sortkey
rows: Number of rows
mbytes: Size of the table in Megabytes
Enc: Y if the table has at least one compressed column, N otherwise
pct_enc:        Proportion of number of encoded columns to total number of columns
pct_of_total: Size of the table in proportion to the cluster size
pct_stats_off: Measure of staleness of table statistics (real size versus size recorded in stats)
pct_unsorted: Proportion of unsorted rows compared to total rows
Notes:

History:
2015-02-16 ericfe created
2017-03-23 thiyagu Added percentage encoded column metric (pct_enc) and fixes  
2017-10-01 mscaer Fixed columns "rows", pct_stats_off, and pct_unsorted to be correct for DISTSTYLE ALL.
**********************************************************************************************/

SELECT TRIM(pgn.nspname) AS SCHEMA,
       TRIM(a.name) AS TABLE,
       id AS TableId,
       decode(pgc.reldiststyle,
             0, 'EVEN',
             1,det.distkey ,
             8,'ALL'
       ) AS DistKey,
       decode(pgc.reldiststyle,
             8,NULL,
             dist_ratio.ratio::DECIMAL(20,4)
       ) AS Skew,
       det.head_sort AS "SortKey",
       det.n_sortkeys AS "#SKs",
       CASE WHEN pgc.reldiststyle = 8 THEN a.rows_all_dist ELSE a.rows END AS rows,
       b.mbytes,
       decode(det.max_enc,
             0,'N',
             'Y'
       ) AS Enc,
       det.pct_enc,
       decode(b.mbytes,
             0,0,
             ((b.mbytes/part.total::DECIMAL)*100)::DECIMAL(20,2)
       ) AS pct_of_total,
       (CASE WHEN a.rows = 0 THEN NULL ELSE 
          CASE WHEN pgc.reldiststyle = 8 THEN ((a.rows_all_dist - pgc.reltuples)::DECIMAL(20,3) / a.rows_all_dist::DECIMAL(20,3)*100)::DECIMAL(20,2)
                ELSE ((a.rows - pgc.reltuples)::DECIMAL(20,3) / a.rows::DECIMAL(20,3)*100)::DECIMAL(20,2) END END
       ) AS pct_stats_off,
       CASE WHEN pgc.reldiststyle = 8 
          THEN decode( det.n_sortkeys,0, NULL,DECODE( a.rows_all_dist,0,0, (a.unsorted_rows_all_dist::DECIMAL(32)/a.rows_all_dist)*100))::DECIMAL(20,2)
          ELSE decode( det.n_sortkeys,0, NULL,DECODE( a.rows,0,0, (a.unsorted_rows::DECIMAL(32)/a.rows)*100))::DECIMAL(20,2) END
        AS pct_unsorted
FROM (SELECT db_id,
             id,
             name,
             SUM(ROWS) AS ROWS,
             MAX(ROWS) AS rows_all_dist,
             SUM(ROWS) - SUM(sorted_rows) AS unsorted_rows,
             MAX(ROWS) - MAX(sorted_rows) AS unsorted_rows_all_dist
      FROM stv_tbl_perm a
      GROUP BY db_id,
               id,
               name) AS a
  JOIN pg_class AS pgc ON pgc.oid = a.id
  JOIN pg_namespace AS pgn ON pgn.oid = pgc.relnamespace
  LEFT OUTER JOIN (SELECT tbl, COUNT(*) AS mbytes FROM stv_blocklist GROUP BY tbl) b ON a.id = b.tbl
  INNER JOIN (SELECT attrelid,
                     MIN(CASE attisdistkey WHEN 't' THEN attname ELSE NULL END) AS "distkey",
                     MIN(CASE attsortkeyord WHEN 1 THEN attname ELSE NULL END) AS head_sort,
                     MAX(attsortkeyord) AS n_sortkeys,
                     MAX(attencodingtype) AS max_enc,
                     SUM(case when attencodingtype <> 0 then 1 else 0 end)::DECIMAL(20,3)/COUNT(attencodingtype)::DECIMAL(20,3)  *100.00 as pct_enc
              FROM pg_attribute
              GROUP BY 1) AS det ON det.attrelid = a.id
  INNER JOIN (SELECT tbl,
                     MAX(Mbytes)::DECIMAL(32) /MIN(Mbytes) AS ratio
              FROM (SELECT tbl,
                           TRIM(name) AS name,
                           slice,
                           COUNT(*) AS Mbytes
                    FROM svv_diskusage
                    GROUP BY tbl,
                             name,
                             slice)
              GROUP BY tbl,
                       name) AS dist_ratio ON a.id = dist_ratio.tbl
  JOIN (SELECT SUM(capacity) AS total
        FROM stv_partitions
        WHERE part_begin = 0) AS part ON 1 = 1
WHERE mbytes IS NOT NULL
AND   pgc.relowner > 1
-- and pgn.nspname = 'schema' -- schemaname
-- and a.name like 'table%' -- tablename
-- and det.max_enc = 0 -- non-compressed tables
ORDER BY mbytes DESC;
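
--for a quicker (less detailed) view of the same information, SVV_TABLE_INFO alone already covers most of these columns:
select "schema", "table", diststyle, size as mbytes, tbl_rows, skew_rows, unsorted, stats_off
from svv_table_info
order by size desc
limit 20;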


/*

Table Skew Inspector. Please see http://docs.aws.amazon.com/redshift/latest/dg/c_analyzing-table-design.html
for more information.

Notes:


History:
2015-11-26 meyersi created
2016-09-13 chriz-bigdata rewrote to simplify and align with documentation

*/
SELECT SCHEMA schemaname,
       "table" tablename,
       table_id tableid,
       size size_in_mb,
       CASE
         WHEN diststyle NOT IN ('EVEN','ALL') THEN 1
         ELSE 0
       END has_dist_key,
       CASE
         WHEN sortkey1 IS NOT NULL THEN 1
         ELSE 0
       END has_sort_key,
       CASE
         WHEN encoded = 'Y' THEN 1
         ELSE 0
       END has_col_encoding,
       ROUND(100*CAST(max_blocks_per_slice - min_blocks_per_slice AS FLOAT) / GREATEST(NVL (min_blocks_per_slice,0)::int,1),2) ratio_skew_across_slices,
       ROUND(CAST(100*dist_slice AS FLOAT) /(SELECT COUNT(DISTINCT slice) FROM stv_slices),2) pct_slices_populated
FROM svv_table_info ti
  JOIN (SELECT tbl,
               MIN(c) min_blocks_per_slice,
               MAX(c) max_blocks_per_slice,
               COUNT(DISTINCT slice) dist_slice
        FROM (SELECT b.tbl,
                     b.slice,
                     COUNT(*) AS c
              FROM STV_BLOCKLIST b
              GROUP BY b.tbl,
                       b.slice)
        WHERE tbl IN (SELECT table_id FROM svv_table_info)
        GROUP BY tbl) iq ON iq.tbl = ti.table_id
ORDER BY SCHEMA, "table";


/**********************************************************************************************
Purpose: Return the top 50 time-consuming statements aggregated by their text.

Columns:
DB: Database where the query ran
n_qry: Number of Queries with same SQL text
qrytext: First 80 Characters of the query SQL
min/max/avg: Min/Max/Avg Execution time
total: Total execution time of all occurrences
max_query_id: Largest query id of the query occurrence
last_run: Last day the query ran
aborted: 0 if query ran to completion, 1 if it was canceled.
alerts: Alert events related to the query

Notes:
There is a commented-out filter to restrict the query to only SELECT statements (otherwise it includes all statements like INSERT, UPDATE, COPY)
There is a commented-out filter to narrow the query to a given database

History:
2015-02-09 ericfe created
2015-04-17 ericfe Added event name and event time filter
**********************************************************************************************/
-- query runtimes
select trim(database) as DB, count(query) as n_qry, max(substring (qrytext,1,80)) as qrytext, min(run_seconds) as "min" , max(run_seconds) as "max", avg(run_seconds) as "avg", sum(run_seconds) as total,  max(query) as max_query_id, 
max(starttime)::date as last_run, aborted,
listagg(event, ', ') within group (order by query) as events
from (
select userid, label, stl_query.query, trim(database) as database, trim(querytxt) as qrytext, md5(trim(querytxt)) as qry_md5, starttime, endtime, datediff(seconds, starttime,endtime)::numeric(12,2) as run_seconds, 
       aborted, decode(alrt.event,'Very selective query filter','Filter','Scanned a large number of deleted rows','Deleted','Nested Loop Join in the query plan','Nested Loop','Distributed a large number of rows across the network','Distributed','Broadcasted a large number of rows across the network','Broadcast','Missing query planner statistics','Stats',alrt.event) as event
from stl_query 
left outer join ( select query, trim(split_part(event,':',1)) as event from STL_ALERT_EVENT_LOG where event_time >=  dateadd(day, -7, current_Date)  group by query, trim(split_part(event,':',1)) ) as alrt on alrt.query = stl_query.query
where userid <> 1 
-- and (querytxt like 'SELECT%' or querytxt like 'select%' ) 
-- and database = ''
and starttime >=  dateadd(day, -7, current_Date)
 ) 
group by database, label, qry_md5, aborted
order by total desc limit 50;
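
--to drill into a single statement from the list, plug the max_query_id into STL_QUERY (replace <max_query_id> with a real ID):
select query, starttime, endtime, aborted, substring(querytxt,1,200) as sql
from stl_query
where query = <max_query_id>;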

Wednesday, July 24, 2019

Presto connectivity to Redshift

This post shows how to add the Redshift connector to Presto so that you can access Redshift directly from Presto.

The majority of our data resides in Redshift and our end users can only access Presto, so I tried adding the Redshift connector to Presto by following the steps below:

1. Add a redshift properties file under /etc/presto/conf/catalog/ (this adds the redshift catalog):

=========
connector.name=redshift
connection-url=jdbc:postgresql://<redshift endpoint url>:<redshift port>/<databasename>
connection-user=<username>
connection-password=<user password>
=========


2. Now, from the Presto CLI, verify that the redshift catalog shows up:


presto> show catalogs;
 Catalog  
----------
 hive     
 redshift 
 system   
(3 rows)

Query 20190724_115553_00001_kcxs8, FINISHED, 2 nodes
Splits: 36 total, 36 done (100.00%)
0:00 [0 rows, 0B] [0 rows/s, 0B/s]


NOTE:
- Restart presto-server if you cannot find the redshift catalog
- It is better to add these properties on the master and all the slave nodes


3. But while trying to access schemas from redshift, I got a "connection attempt failed" error. At first I suspected security group issues.


presto> show schemas from redshift;

Query 20190724_115839_00002_kcxs8 failed: The connection attempt failed.


The issue turned out to be the JDBC URL added to the redshift properties file. After fixing it, everything worked fine.
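
Once the URL was correct, Redshift tables could be queried through Presto like any other catalog; the schema and table names below are placeholders:

presto> show schemas from redshift;
presto> select * from redshift.public.my_table limit 10;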

Tuesday, July 23, 2019

Issues related to S3 COPY command for Redshift

We use the S3 COPY command to move data from S3 into Redshift tables. Make sure the schema for the Redshift table is created before running your COPY command. I'm writing this post to log the COPY command errors I faced, which might save others some time.

The COPY command I have used for a while:

copy <redshift table name>
from '<s3 file location>'
iam_role '<iam role arn>'
NULL AS 'NULL'
EMPTYASNULL
delimiter ','
IGNOREHEADER 1;
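
As mentioned above, the target table has to exist before the COPY runs; a minimal placeholder table definition looks like this (column names and types are made up for illustration):

create table <redshift table name> (
  id         integer,
  name       varchar(100),
  created_at timestamp
);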

Problem 1: I faced an issue with delimiters on a CSV file. The delimiter for a CSV file is the comma, but the problem with keeping the comma as the delimiter is that the load fails with the error below if the data itself also contains commas.

Error Code:  1213, Error Description: Missing data for not-null field

AWS Documentation link that helped in solving this issue:
https://docs.aws.amazon.com/redshift/latest/dg/r_COPY_command_examples.html#load-from-csv

Solution 1.1: As the issue is with the comma as a delimiter, I specified the file format as CSV and ignored the headers. This fixed the issue.

copy <Redshift Table Name>
from 's3://<location of the file>'
iam_role '<iam role arn>'
csv
IGNOREHEADER 1;

Solution 1.2: Using ESCAPE and REMOVEQUOTES should work as well:

copy <redshift table name>
from '<s3 file location>'
iam_role '<iam role arn>'
delimiter ','
escape
REMOVEQUOTES
IGNOREHEADER 1;


Problem 2: Redshift doesn't support materialized views, and views cannot be loaded with the COPY command. I tried copying a view directly into Redshift from S3, which resulted in the error below:

Error: psycopg2.errors.InternalError_: Invalid table DETAIL: ----------------------------------------------- error: Invalid table code: 8001 context: ************ is not a table. Views cannot be loaded with the COPY command. query: 0 location: compression_analyzer.cpp:432 process: padbmaster [pid=16256] -----------------------------------------------

Solution 2.1: Create the view as a table and then COPY, if you don't care whether it ends up as a view or a table on Redshift.

Solution 2.2: First COPY all the underlying tables, and then CREATE VIEW on Redshift.
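
A rough sketch of Solution 2.2 (bucket, table, view, and column names are all placeholders):

copy public.orders
from 's3://<bucket>/orders/'
iam_role '<iam role arn>'
csv
IGNOREHEADER 1;

create view public.orders_summary as
select order_date, count(*) as order_count
from public.orders
group by order_date;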

FYI, a note on the JSON format:
The format of the file you are trying to copy also plays an important role in COPY issues. I'm using CSV files for most of the test runs because I hit a 4MB per-object limit while dealing with the JSON format. Redshift's JSON parsing has more edge cases to deal with. FYI, here is the error I got related to JSON:


psycopg2.errors.InternalError_: The total size of the JSON object exceeds the max limit of 4194304 bytes
DETAIL: 
 ----------------------------------------------
error:  The total size of the JSON object exceeds the max limit of 4194304 bytes
code:      8001
--------------------------------------------------
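
For reference, a JSON COPY generally looks like this ('auto' maps JSON keys to column names); it only works while every individual JSON object stays under the 4194304-byte limit:

copy <redshift table name>
from '<s3 file location>'
iam_role '<iam role arn>'
json 'auto';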


Problem 3: "Delimiter not found" while running the COPY command

Try FILLRECORD, which fills missing trailing columns with zero-length strings or NULLs:

copy <redshift table name>
from '<s3 file location>'
iam_role '<iam role arn>'
csv
IGNOREHEADER 1
FILLRECORD;


Problem 4: When there are issues with timestamps and the file format is not declared as CSV but is comma separated

Solution:

copy <redshift table name>
from 's3://location' 
iam_role 'iam role arn'
delimiter ','
escape
NULL as 'null'
REMOVEQUOTES
TIMEFORMAT 'auto';