Friday, July 26, 2019

Monitoring Queries for Redshift


All Redshift monitoring queries in one place.
NOTE: All queries come from the AWS Redshift documentation and the amazon-redshift-utils repository. The system tables are documented in the "System Tables Reference" section of the Amazon Redshift Database Developer Guide, and the utility scripts live at https://github.com/awslabs/amazon-redshift-utils


--reason why an S3 COPY failed (replace <queryID> with the failing query's ID)
select d.query, substring(d.filename,14,20) as filename,
d.line_number as line,
substring(d.value,1,16) as value,
substring(le.err_reason,1,48) as err_reason
from stl_loaderror_detail d, stl_load_errors le
where d.query = le.query
and d.query = <queryID>;
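
If you don't yet know the failing query's ID, the most recent entries in STL_LOAD_ERRORS will point you to it. A minimal sketch, newest errors first:

--most recent load errors (use the query column as <queryID> above)
select query, starttime, trim(filename) as filename,
       line_number, err_code, trim(err_reason) as err_reason
from stl_load_errors
order by starttime desc
limit 10;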


--COPY performance information from the past 7 days
select q.starttime,  s.query, substring(q.querytxt,1,120) as querytxt,
       s.n_files, size_mb, s.time_seconds,
       s.size_mb/decode(s.time_seconds,0,1,s.time_seconds)  as mb_per_s
from (select query, count(*) as n_files,
     sum(transfer_size/(1024*1024)) as size_MB, (max(end_Time) -
         min(start_Time))/(1000000) as time_seconds , max(end_time) as end_time
      from stl_s3client where http_method = 'GET' and query > 0
       and transfer_time > 0 group by query ) as s
LEFT JOIN stl_Query as q on q.query = s.query
where s.end_Time >=  dateadd(day, -7, current_Date)
order by s.time_Seconds desc, size_mb desc, s.end_time desc
limit 50;
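
COPY throughput depends on how well the input files spread across slices; ideally n_files above is a multiple of the cluster's slice count. A quick check of the slice count:

--number of slices in the cluster (compare against n_files above)
select count(*) as n_slices from stv_slices;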


--sessions with currently running queries
SELECT s.process AS pid
       ,date_Trunc ('second',s.starttime) AS S_START
       ,datediff(minutes,s.starttime,getdate ()) AS conn_mins
       ,trim(s.user_name) AS USER
       ,trim(s.db_name) AS DB
       ,date_trunc ('second',i.starttime) AS Q_START
       ,i.query
       ,trim(i.query) AS sql
FROM stv_sessions s
  LEFT JOIN stv_recents i
         ON s.process = i.pid
        AND i.status = 'Running'
WHERE s.user_name <> 'rdsdb'
ORDER BY 1;
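
If one of these sessions needs to be killed outright, pg_terminate_backend takes its pid (the pid below is a placeholder):

--terminate a session by pid (placeholder value)
select pg_terminate_backend(12345);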

    

--running queues: queries currently executing or waiting in WLM, with queue time, spill, and alert flags
select trim(u.usename) as user, s.pid, q.xid,q.query,q.service_class as "q", q.slot_count as slt, date_trunc('second',q.wlm_start_time) as start,decode(trim(q.state), 'Running','Run','QueuedWaiting','Queue','Returning','Return',trim(q.state)) as state, 
q.queue_Time/1000000 as q_sec, q.exec_time/1000000 as exe_sec, m.cpu_time/1000000 cpu_sec, m.blocks_read read_mb, decode(m.blocks_to_disk,-1,null,m.blocks_to_disk) spill_mb , m2.rows as ret_rows, m3.rows as NL_rows,
substring(replace(nvl(qrytext_cur.text,trim(translate(s.text,chr(10)||chr(13)||chr(9) ,''))),'\\n',' '),1,90) as sql,
trim(decode(event&1,1,'SK ','') || decode(event&2,2,'Del ','') || decode(event&4,4,'NL ','') ||  decode(event&8,8,'Dist ','') || decode(event&16,16,'Bcast ','') || decode(event&32,32,'Stats ','')) as Alert
from  stv_wlm_query_state q 
left outer join stl_querytext s on (s.query=q.query and sequence = 0)
left outer join stv_query_metrics m on ( q.query = m.query and m.segment=-1 and m.step=-1 )
left outer join stv_query_metrics m2 on ( q.query = m2.query and m2.step_type = 38 )
left outer join ( select query, sum(rows) as rows from stv_query_metrics m3 where step_type = 15 group by 1) as m3 on ( q.query = m3.query )
left outer join pg_user u on ( s.userid = u.usesysid )
LEFT OUTER JOIN (SELECT ut.xid,'CURSOR ' || TRIM( substring ( TEXT from strpos(upper(TEXT),'SELECT') )) as TEXT
                   FROM stl_utilitytext ut
                   WHERE sequence = 0
                   AND upper(TEXT) like 'DECLARE%'
                   GROUP BY text, ut.xid) qrytext_cur ON (q.xid = qrytext_cur.xid)
left outer join ( select query,sum(decode(trim(split_part(event,':',1)),'Very selective query filter',1,'Scanned a large number of deleted rows',2,'Nested Loop Join in the query plan',4,'Distributed a large number of rows across the network',8,'Broadcasted a large number of rows across the network',16,'Missing query planner statistics',32,0)) as event from STL_ALERT_EVENT_LOG 
     where event_time >=  dateadd(hour, -8, current_Date) group by query  ) as alrt on alrt.query = q.query
order by q.service_class,q.exec_time desc, q.wlm_start_time;
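
To stop one of the queries listed above without killing its session, CANCEL takes the pid from the first column (placeholder value shown):

--cancel a running query by pid (placeholder value)
cancel 18764;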


    
/**********************************************************************************************
Purpose: Return Alerts from past 7 days
Columns:
table:     Name of the table where the alert happened, when applicable
minutes:     Number of minutes spent doing the action. Not available for all alerts
rows:           Number of rows in the scan/dist/bcast
event:     The event that triggered the alert
solution:    Proposed solution to avoid the alert (and the performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count:     Number of occurrences of the alert
Notes:
History:
2015-02-09 ericfe created
2015-04-17 ericfe Added detail information on distributions and broadcasts. Added rows column
2017-08-11 andrewgross Added schema and query text information.
**********************************************************************************************/
SELECT t.schema AS SCHEMA,
       trim(s.perm_table_name) AS TABLE,
       (sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), CASE WHEN coalesce(b.endtime,d.endtime,s.endtime) > coalesce(b.starttime,d.starttime,s.starttime) THEN coalesce(b.endtime,d.endtime,s.endtime) ELSE coalesce(b.starttime,d.starttime,s.starttime) END)))/60)::numeric(24,0) AS minutes,
       sum(coalesce(b.rows,d.rows,s.rows)) AS ROWS,
       trim(split_part(l.event,':',1)) AS event,
       substring(trim(l.solution),1,60) AS solution,
       max(l.query) AS sample_query,
       count(DISTINCT l.query),
       q.text AS query_text
FROM stl_alert_event_log AS l
LEFT JOIN stl_scan AS s ON s.query = l.query
AND s.slice = l.slice
AND s.segment = l.segment
LEFT JOIN stl_dist AS d ON d.query = l.query
AND d.slice = l.slice
AND d.segment = l.segment
LEFT JOIN stl_bcast AS b ON b.query = l.query
AND b.slice = l.slice
AND b.segment = l.segment
LEFT JOIN
  (SELECT query,
           LISTAGG(text) WITHIN GROUP (ORDER BY sequence) AS text
   FROM stl_querytext
   WHERE sequence < 100
   GROUP BY query) AS q ON q.query = l.query
LEFT JOIN svv_table_info AS t ON t.table_id = s.tbl
WHERE l.userid >1
  AND l.event_time >= dateadd(DAY, -7, CURRENT_DATE)
  AND s.perm_table_name NOT LIKE 'volt_tt%'
  AND SCHEMA IS NOT NULL
GROUP BY 1,
         2,
         5,
         6,
         query_text
ORDER BY 3 DESC, 7 DESC;
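
Most alerts map to a simple follow-up. A sketch for the two most common ones, with a placeholder table name:

--"Missing query planner statistics": refresh the table's statistics
analyze public.my_table;
--"Scanned a large number of deleted rows": reclaim space from deleted rows
vacuum delete only public.my_table;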


/**********************************************************************************************
Purpose: Returns the per-hour resource usage per queue for the past 2 days.
These results can be used to fine-tune WLM queues and find peak workload times.
    
Columns:

   exec_hour: Hour of execution of queries
   q: ID for the service class, defined in the WLM configuration file. 
   n_cp: Number of queries completed on that queue/hour
   n_ev: Number of queries evicted on that queue/hour
   avg_q_sec: Average queueing time in seconds
   avg_e_sec: Average execution time in seconds
   avg_pct_cpu: Average percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
   max_pct_cpu: Max percentage of CPU used by the query. Value can be more than 100% for multi-cpu/slice systems
   max_spill: Max spill (MB) by a single query on that queue/hour
   sum_spill_mb: Sum of Spill usage by that queue on that hour
   sum_row_scan: Sum of rows scanned on that queue/hour
   sum_join_rows: Sum of rows joined on that queue/hour
   sum_nl_join_rows: Sum of rows Joined using Nested Loops on that queue/hour
   sum_ret_rows: Sum of rows returned to the leader/client on that queue/hour
   sum_spec_mb: Sum of Megabytes scanned by a Spectrum query on that queue/hour
   
Notes:


History:

2017-08-09 ericnf created
2017-12-18 ericnf add rows for cached queries
**********************************************************************************************/
select date_trunc('hour', convert_timezone('utc','utc',w.exec_start_time)) as exec_hour, w.service_class as "Q", sum(decode(w.final_state, 'Completed',1,'Evicted',0,0)) as n_cp,  sum(decode(w.final_state, 'Completed',0,'Evicted',1,0)) as n_ev, avg(w.total_queue_time/1000000) as avg_q_sec, avg(w.total_exec_time/1000000) as avg_e_sec,
       avg(m.query_cpu_usage_percent) as avg_pct_cpu, max(m.query_cpu_usage_percent) as max_pct_cpu, max(m.query_temp_blocks_to_disk) as max_spill, sum(m.query_temp_blocks_to_disk) as sum_spill_mb, sum(m.scan_row_count) as sum_row_scan, sum(m.join_row_count) as sum_join_rows, sum(m.nested_loop_join_row_count) as sum_nl_join_rows, 
       sum(m.return_row_count) as sum_ret_rows, sum(m.spectrum_scan_size_mb) as sum_spec_mb
from   stl_wlm_query as w left join svl_query_metrics_summary as m using (userid,service_Class, query)
where  service_class > 5 
  and     w.exec_start_time >=  dateadd(day, -1, current_Date) group by 1,2 
union all
select date_trunc('hour', convert_timezone('utc','utc',c.starttime)) as exec_hour, 0 as "Q", sum(decode(c.aborted, 1,0,1)) as n_cp,  sum(decode(c.aborted, 1,1,0)) as n_ev, 0 as avg_q_sec, avg(c.elapsed/1000000) as avg_e_sec,
       0 as avg_pct_cpu, 0 as max_pct_cpu, 0 as max_spill, 0 as sum_spill_mb, 0 as sum_row_scan, 0 as sum_join_rows, 0 as sum_nl_join_rows, sum(m.return_row_count) as sum_ret_rows, 0 as sum_spec_mb
from svl_qlog c left join svl_query_metrics_summary as m on ( c.userid = m.userid and c.source_query=m.query ) 
 where source_query is not null and     c.starttime >=  dateadd(day, -1, current_Date)
group by 1,2  order by  1 desc,2 ;
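
To interpret these per-queue numbers it helps to see how the queues are configured. A sketch against STV_WLM_SERVICE_CLASS_CONFIG (the column selection here is an assumption; service classes 1-5 are reserved for system queues):

--current WLM queue definitions
select service_class, trim(name) as name, num_query_tasks as slots,
       query_working_mem as mem_mb, max_execution_time as timeout_ms
from stv_wlm_service_class_config
where service_class > 5;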


/**********************************************************************************************
Purpose: Return table-related performance alerts from the past 7 days

Columns:
schema:    Name of the schema
table:     Name of the table where the alert happened, when applicable
table_rows: Size of table in rows
minutes:     Number of minutes spent doing the action. Not available for all alerts
scanned_rows:   Number of rows in the scan/dist/bcast
event:     The event that triggered the alert
solution:    Proposed solution to avoid the alert (and the performance issue) in the future
sample_query: query_id of the latest occurrence of that alert
count:     Number of occurrences of the alert

Notes:

History:
2015-02-09 ericfe created
2018-09-12 ericfe re-publish to differentiate with the one with SQL text
**********************************************************************************************/
select trim(pgn.nspname) as Schema, trim(s.perm_table_name) as table , tot_rows as table_rows, (sum(abs(datediff(seconds, coalesce(b.starttime,d.starttime,s.starttime), coalesce(b.endtime,d.endtime,s.endtime))))/60)::numeric(24,0) as minutes, 
       sum(coalesce(b.rows,d.rows,s.rows)) as scanned_rows, trim(split_part(l.event,':',1)) as event,  substring(trim(l.solution),1,60) as solution , max(l.query) as sample_query, count(*) as count
from stl_alert_event_log as l 
left join stl_scan as s on s.query = l.query and s.slice = l.slice and s.segment = l.segment
left join stl_dist as d on d.query = l.query and d.slice = l.slice and d.segment = l.segment 
left join stl_bcast as b on b.query = l.query and b.slice = l.slice and b.segment = l.segment
left join ( select id, sum(rows) as tot_rows from stv_Tbl_perm group by id) as t on s.tbl = t.id 
left join pg_class as pgc on pgc.oid = s.tbl left join pg_namespace as pgn on pgn.oid = pgc.relnamespace
where l.userid >1  and  l.event_time >=  dateadd(day, -7, getdate()) AND s.perm_table_name NOT LIKE 'volt_tt%' AND s.perm_table_name NOT LIKE 'Internal Worktable'
group by  1,2,3, 6,7 order by 4 desc,8 desc;
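
The solution column above is truncated to 60 characters. To read the full recommendation for one alert, look it up by its sample_query ID (placeholder value shown):

--full alert text for one query
select trim(event) as event, trim(solution) as solution
from stl_alert_event_log
where query = 123456;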

/**********************************************************************************************
Purpose: Return Table storage information (size, skew, etc)
Columns:
schema: Schema name
Table: Table name
id: Table id
DistKey: Distribution key (shows EVEN for evenly distributed tables, ALL for DISTSTYLE ALL)
Skew: Table Skew. Proportion between largest slice and smallest slice (null for diststyle ALL)
Sortkey: First column of Sortkey
#SKs: Number of columns in the compound sortkey
rows: Number of rows
mbytes: Size of the table in Megabytes
Enc: Y if the table has at least one compressed column, N otherwise
pct_enc:        Proportion of number of encoded columns to total number of columns
pct_of_total: Size of the table in proportion to the cluster size
pct_stats_off: Measure of staleness of table statistics (real size versus size recorded in stats)
pct_unsorted: Proportion of unsorted rows compared to total rows
Notes:

History:
2015-02-16 ericfe created
2017-03-23 thiyagu Added percentage encoded column metric (pct_enc) and fixes  
2017-10-01 mscaer Fixed columns "rows", pct_stats_off, and pct_unsorted to be correct for DISTSTYLE ALL.
**********************************************************************************************/

SELECT TRIM(pgn.nspname) AS SCHEMA,
       TRIM(a.name) AS TABLE,
       id AS TableId,
       decode(pgc.reldiststyle,
             0, 'EVEN',
             1,det.distkey ,
             8,'ALL'
       ) AS DistKey,
       decode(pgc.reldiststyle,
             8,NULL,
             dist_ratio.ratio::DECIMAL(20,4)
       ) AS Skew,
       det.head_sort AS "SortKey",
       det.n_sortkeys AS "#SKs",
       CASE WHEN pgc.reldiststyle = 8 THEN a.rows_all_dist ELSE a.rows END AS rows,
       b.mbytes,
       decode(det.max_enc,
             0,'N',
             'Y'
       ) AS Enc,
       det.pct_enc,
       decode(b.mbytes,
             0,0,
             ((b.mbytes/part.total::DECIMAL)*100)::DECIMAL(20,2)
       ) AS pct_of_total,
       (CASE WHEN a.rows = 0 THEN NULL ELSE 
          CASE WHEN pgc.reldiststyle = 8 THEN ((a.rows_all_dist - pgc.reltuples)::DECIMAL(20,3) / a.rows_all_dist::DECIMAL(20,3)*100)::DECIMAL(20,2)
                ELSE ((a.rows - pgc.reltuples)::DECIMAL(20,3) / a.rows::DECIMAL(20,3)*100)::DECIMAL(20,2) END END
       ) AS pct_stats_off,
       CASE WHEN pgc.reldiststyle = 8 
          THEN decode( det.n_sortkeys,0, NULL,DECODE( a.rows_all_dist,0,0, (a.unsorted_rows_all_dist::DECIMAL(32)/a.rows_all_dist)*100))::DECIMAL(20,2)
          ELSE decode( det.n_sortkeys,0, NULL,DECODE( a.rows,0,0, (a.unsorted_rows::DECIMAL(32)/a.rows)*100))::DECIMAL(20,2) END
        AS pct_unsorted
FROM (SELECT db_id,
             id,
             name,
             SUM(ROWS) AS ROWS,
             MAX(ROWS) AS rows_all_dist,
             SUM(ROWS) - SUM(sorted_rows) AS unsorted_rows,
             MAX(ROWS) - MAX(sorted_rows) AS unsorted_rows_all_dist
      FROM stv_tbl_perm a
      GROUP BY db_id,
               id,
               name) AS a
  JOIN pg_class AS pgc ON pgc.oid = a.id
  JOIN pg_namespace AS pgn ON pgn.oid = pgc.relnamespace
  LEFT OUTER JOIN (SELECT tbl, COUNT(*) AS mbytes FROM stv_blocklist GROUP BY tbl) b ON a.id = b.tbl
  INNER JOIN (SELECT attrelid,
                     MIN(CASE attisdistkey WHEN 't' THEN attname ELSE NULL END) AS "distkey",
                     MIN(CASE attsortkeyord WHEN 1 THEN attname ELSE NULL END) AS head_sort,
                     MAX(attsortkeyord) AS n_sortkeys,
                     MAX(attencodingtype) AS max_enc,
                     SUM(case when attencodingtype <> 0 then 1 else 0 end)::DECIMAL(20,3)/COUNT(attencodingtype)::DECIMAL(20,3)  *100.00 as pct_enc
              FROM pg_attribute
              GROUP BY 1) AS det ON det.attrelid = a.id
  INNER JOIN (SELECT tbl,
                     MAX(Mbytes)::DECIMAL(32) /MIN(Mbytes) AS ratio
              FROM (SELECT tbl,
                           TRIM(name) AS name,
                           slice,
                           COUNT(*) AS Mbytes
                    FROM svv_diskusage
                    GROUP BY tbl,
                             name,
                             slice)
              GROUP BY tbl,
                       name) AS dist_ratio ON a.id = dist_ratio.tbl
  JOIN (SELECT SUM(capacity) AS total
        FROM stv_partitions
        WHERE part_begin = 0) AS part ON 1 = 1
WHERE mbytes IS NOT NULL
AND   pgc.relowner > 1
-- and pgn.nspname = 'schema' -- schemaname
-- and a.name like 'table%' -- tablename
-- and det.max_enc = 0 -- non-compressed tables
ORDER BY mbytes DESC;
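
High pct_unsorted and pct_stats_off usually call for routine maintenance. A minimal sketch, with a placeholder table name:

--high pct_unsorted: re-sort the table
vacuum sort only public.my_table;
--high pct_stats_off: refresh planner statistics
analyze public.my_table;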


/*

Table Skew Inspector. Please see http://docs.aws.amazon.com/redshift/latest/dg/c_analyzing-table-design.html
for more information.

Notes:


History:
2015-11-26 meyersi created
2016-09-13 chriz-bigdata rewrote to simplify and align with documentation

*/
SELECT SCHEMA schemaname,
       "table" tablename,
       table_id tableid,
       size size_in_mb,
       CASE
         WHEN diststyle NOT IN ('EVEN','ALL') THEN 1
         ELSE 0
       END has_dist_key,
       CASE
         WHEN sortkey1 IS NOT NULL THEN 1
         ELSE 0
       END has_sort_key,
       CASE
         WHEN encoded = 'Y' THEN 1
         ELSE 0
       END has_col_encoding,
       ROUND(100*CAST(max_blocks_per_slice - min_blocks_per_slice AS FLOAT) / GREATEST(NVL (min_blocks_per_slice,0)::int,1),2) ratio_skew_across_slices,
       ROUND(CAST(100*dist_slice AS FLOAT) /(SELECT COUNT(DISTINCT slice) FROM stv_slices),2) pct_slices_populated
FROM svv_table_info ti
  JOIN (SELECT tbl,
               MIN(c) min_blocks_per_slice,
               MAX(c) max_blocks_per_slice,
               COUNT(DISTINCT slice) dist_slice
        FROM (SELECT b.tbl,
                     b.slice,
                     COUNT(*) AS c
              FROM STV_BLOCKLIST b
              GROUP BY b.tbl,
                       b.slice)
        WHERE tbl IN (SELECT table_id FROM svv_table_info)
        GROUP BY tbl) iq ON iq.tbl = ti.table_id
ORDER BY SCHEMA, "table";
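
A common fix for heavy skew is a deep copy into a table with a better distribution key. A sketch with placeholder table and column names:

--deep copy into a new distribution key, then swap names
create table public.my_table_new distkey (customer_id) as
select * from public.my_table;
alter table public.my_table rename to my_table_old;
alter table public.my_table_new rename to my_table;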


/**********************************************************************************************
Purpose: Return the top 50 time-consuming statements, aggregated by their SQL text.

Columns:
DB: Database where the query ran
n_qry: Number of Queries with same SQL text
qrytext: First 80 Characters of the query SQL
min/max/avg: Min/Max/Avg Execution time
total: Total execution time of all occurrences
max_query_id: Largest query id among the occurrences
last_run: Last day the query ran
aborted: 0 if query ran to completion, 1 if it was canceled.
alerts: Alert events related to the query

Notes:
There is a commented-out filter to restrict the results to SELECT statements only (otherwise all statements are included: INSERT, UPDATE, COPY, etc.)
There is a commented-out filter to narrow the results to a given database

History:
2015-02-09 ericfe created
2015-04-17 ericfe Added event name and event time filter
**********************************************************************************************/
-- query runtimes
select trim(database) as DB, count(query) as n_qry, max(substring (qrytext,1,80)) as qrytext, min(run_seconds) as "min" , max(run_seconds) as "max", avg(run_seconds) as "avg", sum(run_seconds) as total,  max(query) as max_query_id, 
max(starttime)::date as last_run, aborted,
listagg(event, ', ') within group (order by query) as events
from (
select userid, label, stl_query.query, trim(database) as database, trim(querytxt) as qrytext, md5(trim(querytxt)) as qry_md5, starttime, endtime, datediff(seconds, starttime,endtime)::numeric(12,2) as run_seconds, 
       aborted, decode(alrt.event,'Very selective query filter','Filter','Scanned a large number of deleted rows','Deleted','Nested Loop Join in the query plan','Nested Loop','Distributed a large number of rows across the network','Distributed','Broadcasted a large number of rows across the network','Broadcast','Missing query planner statistics','Stats',alrt.event) as event
from stl_query 
left outer join ( select query, trim(split_part(event,':',1)) as event from STL_ALERT_EVENT_LOG where event_time >=  dateadd(day, -7, current_Date)  group by query, trim(split_part(event,':',1)) ) as alrt on alrt.query = stl_query.query
where userid <> 1 
-- and (querytxt like 'SELECT%' or querytxt like 'select%' ) 
-- and database = ''
and starttime >=  dateadd(day, -7, current_Date)
 ) 
group by database, label, qry_md5, aborted
order by total desc limit 50;
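
qrytext above is truncated to 80 characters. To reconstruct the full SQL for a given max_query_id (placeholder value shown), stitch the pieces back together from STL_QUERYTEXT:

--full SQL text for one query id
select query, listagg(text) within group (order by sequence) as full_sql
from stl_querytext
where query = 123456
group by query;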
