===================================================== Database Incident Handling ======================================================================
I. Current (live) situation
1. Monitor current OS activity with topas
topas
2. Monitor current wait events
set pagesize 999
set linesize 300
col event format a66
col wait_class format a30
select event,count(*),wait_class from gv$session group by event,wait_class order by 2;
3. Find sessions by wait event
col machine format a32
col event format a32
col program format a32
set linesize 555
select sid,machine,program,sql_id,event,p1,p2 from v$session where event='&event_list' order by sql_id;
4. Monitor current database session activity
set pagesize 999
set lines 666
col username format a13
col prog format a10 trunc
col sql_text format a41 trunc
col sid format 999999
col child for 999999
col avg_etime for 999,999.99
col EXECS for 99999999999
select sid,
serial#,
status,
username,
substr(program,1,19) prog,
address,
hash_value,
b.sql_id,
child_number child,
plan_hash_value,
executions execs,
(elapsed_time / decode(nvl(executions,0),0,1,executions)) / 1000000 avg_etime,
sql_text
from v$session a,v$sql b
where status='ACTIVE'
and username is not null
and a.sql_id=b.sql_id
and a.sql_child_number=b.child_number
and sql_text not like
'%select sid,serial#,username,substr(program,1,19) prog,%' --don't show this query
order by sql_id,sql_child_number
/
5. For an abnormal SQL_ID found above, check its AWR history; focus on PLAN_HASH_VALUE
set lines 155
col execs for 999,999,999
col avg_etime for 999,999.999
col avg_lio for 999,999,999.9
col begin_interval_time for a30
col node for 99999
break on plan_hash_value on startup_time skip 1
select ss.snap_id,
ss.instance_number node,
begin_interval_time,
sql_id,
plan_hash_value,
nvl(executions_delta,0) execs,
(elapsed_time_delta /
decode(nvl(executions_delta,0),0,1,executions_delta))/1000000 avg_etime,
(buffer_gets_delta /
decode(nvl(executions_delta,0),0,1,executions_delta)) avg_lio,
(disk_reads_delta/
decode(nvl(executions_delta,0),0,1,executions_delta)) avg_pio
from DBA_HIST_SQLSTAT S,DBA_HIST_SNAPSHOT SS
where sql_id='&sql_id'
and ss.snap_id=S.snap_id
and ss.instance_number=S.instance_number
and s.instance_number like nvl('&instance_number',s.instance_number)
and executions_delta>0
order by 1,2,3
/
6. From the output above, determine whether the plan changed, then display the current execution plan
set linesize 300
select * from table(dbms_xplan.display_cursor('&sql_id',NULL));
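If the cursor has already aged out of the shared pool, the historical plans can still be pulled from the AWR repository (a minimal sketch; assumes the sql_id was captured by AWR):
select * from table(dbms_xplan.display_awr('&sql_id'));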
II. Historical (after-the-fact) situation
1. Check snapshot information
select INSTANCE_NUMBER,SNAP_ID,BEGIN_INTERVAL_TIME from DBA_HIST_SNAPSHOT where BEGIN_INTERVAL_TIME>sysdate-1 order by SNAP_ID;
##Per-execution cost of a statement within a snapshot range (by snap_id)
select /*+parallel(t,4)*/
plan_hash_value,
buffer_gets_delta/executions_delta get_exec,
disk_reads_delta/executions_delta read_exec,
cpu_time_delta/executions_delta/1000 cpu_exec_ms ,
elapsed_time_delta/executions_delta/1000 elaps_exec_ms ,
parsing_schema_id,
ROWS_PROCESSED_delta/executions_delta rows_exec,
executions_delta
from dba_hist_sqlstat t
where sql_id = '1mgfnuxggrfj0'
and instance_number = 1
and SNAP_ID between 8978 and 8988
and EXECUTIONS_DELTA > 0;
Replay database-level wait events during the incident window:
col event for a35
set lines 222
set pages 9999
col p1 for 999999999999999999999999999999
select instance_number,sample_id,sample_time,event,count(*) from dba_hist_active_sess_history
where sample_time >
to_timestamp('2018-08-02 11:00:00','yyyy-mm-dd hh24:mi:ss')
and sample_time <
to_timestamp('2018-08-02 11:55:00','yyyy-mm-dd hh24:mi:ss')
--and event ='change tracking file synchronous read'
group by instance_number,sample_id,sample_time,event
order by 1,3,5;
col event for a35
set lines 222
set pages 9999
col p1 for 999999999999999999999999999999
select sample_id,sample_time,event,count(*) from gv$active_session_history
where INST_ID=2 and event is not null
and sample_id between 43715229 and 43715911
group by sample_id,sample_time,event
order by 1,4;
2. Check SQL execution statistics by snapshot (look for unstable plans)
set lines 155
col execs for 999,999,999
col min_etime for 999,999.99
col max_etime for 999,999.99
col avg_etime for 999,999.999
col avg_lio for 999,999,999.9
col norm_stddev for 999,999.9999
col begin_interval_time for a30
col node for 99999
break on plan_hash_value on startup_time skip 1
select * from
(select sql_id,
sum(execs) execs,
min(avg_etime) min_etime,
max(avg_etime) max_etime,
stddev_etime / min(avg_etime) norm_stddev
from (select sql_id,
plan_hash_value,
execs,
avg_etime,
stddev(avg_etime) over(partition by sql_id) stddev_etime
from (select sql_id,
plan_hash_value,
sum(nvl(executions_delta,0)) execs,
(sum(elapsed_time_delta)/
decode(sum(nvl(executions_delta,0)),
0,
1,
sum(executions_delta)) /1000000) avg_etime
--,sum((buffer_gets_delta/decode(nvl(buffer_gets_delta,0),0,1,executions_delta)) avg_lio
from DBA_HIST_SQLSTAT S,DBA_HIST_SNAPSHOT SS
where ss.snap_id=s.snap_id
and ss.instance_number=s.instance_number
and executions_delta>0
and elapsed_time_delta>0
and s.snap_id > nvl('&start_snap_id',0)
and s.snap_id < nvl('&last_snap_id',99999999)
group by sql_id,plan_hash_value))
group by sql_id,stddev_etime)
where max_etime>0.01
order by 4
/
3. Compare SQL elapsed time before and after a point in time (the default of days_ago below can be changed; the script defaults to 1 day)
alter session set nls_date_format='yyyy-mm-dd hh24:mi:ss';
accept days_ago -
prompt 'Enter Days ago: ' -
default '1'
set lines 155
col execs for 999,999,999
col before_etime for 999,990.99
col after_etime for 999,990.99
col before_avg_etime for 999,990.99 head AVG_ETIME_BEFORE
col after_avg_etime for 999,990.99 head AVG_ETIME_AFTER
col min_etime for 999,990.99
col max_etime for 999,990.99
col avg_etime for 999,990.999
col avg_lio for 999,999,990.9
col norm_stddev for 999,990.9999
col begin_interval_time for a30
col node for 99999
break on plan_hash_value on startup_time skip 1
select * from (
select sql_id, execs, before_avg_etime, after_avg_etime, norm_stddev,
case when to_number(before_avg_etime) < to_number(after_avg_etime) then 'Slower' else 'Faster' end result
-- select *
from (
select sql_id, sum(execs) execs, sum(before_execs) before_execs, sum(after_execs) after_execs,
sum(before_avg_etime) before_avg_etime, sum(after_avg_etime) after_avg_etime,
min(avg_etime) min_etime, max(avg_etime) max_etime, stddev_etime/min(avg_etime) norm_stddev,
case when sum(before_avg_etime) > sum(after_avg_etime) then 'Slower' else 'Faster' end better_or_worse
from (
select sql_id,
period_flag,
execs,
avg_etime,
stddev_etime,
case when period_flag = 'Before' then execs else 0 end before_execs,
case when period_flag = 'Before' then avg_etime else 0 end before_avg_etime,
case when period_flag = 'After' then execs else 0 end after_execs,
case when period_flag = 'After' then avg_etime else 0 end after_avg_etime
from (
select sql_id, period_flag, execs, avg_etime,
stddev(avg_etime) over (partition by sql_id) stddev_etime
from (
select sql_id, period_flag, sum(execs) execs, sum(etime)/sum(decode(execs,0,1,execs)) avg_etime from (
select sql_id, 'Before' period_flag,
nvl(executions_delta,0) execs,
(elapsed_time_delta)/1000000 etime
-- sum((buffer_gets_delta/decode(nvl(buffer_gets_delta,0),0,1,executions_delta))) avg_lio
from DBA_HIST_SQLSTAT S, DBA_HIST_SNAPSHOT SS
where ss.snap_id = S.snap_id
and ss.instance_number = S.instance_number
and executions_delta > 0
and elapsed_time_delta > 0
and ss.begin_interval_time <= sysdate-&&days_ago
union
select sql_id, 'After' period_flag,
nvl(executions_delta,0) execs,
(elapsed_time_delta)/1000000 etime
-- (elapsed_time_delta)/decode(nvl(executions_delta,0),0,1,executions_delta)/1000000 avg_etime
-- sum((buffer_gets_delta/decode(nvl(buffer_gets_delta,0),0,1,executions_delta))) avg_lio
from DBA_HIST_SQLSTAT S, DBA_HIST_SNAPSHOT SS
where ss.snap_id = S.snap_id
and ss.instance_number = S.instance_number
and executions_delta > 0
and elapsed_time_delta > 0
and ss.begin_interval_time > sysdate-&&days_ago
-- and s.snap_id > 7113
)
group by sql_id, period_flag
)
)
)
group by sql_id, stddev_etime
)
)
where after_avg_etime>0.01
order by norm_stddev
/
4. For an abnormal SQL_ID found above, check its AWR history; focus on PLAN_HASH_VALUE
set lines 155
col execs for 999,999,999
col avg_etime for 999,999.999
col avg_lio for 999,999,999.9
col begin_interval_time for a30
col node for 99999
break on plan_hash_value on startup_time skip 1
select ss.snap_id,
ss.instance_number node,
begin_interval_time,
sql_id,
plan_hash_value,
nvl(executions_delta,0) execs,
(elapsed_time_delta/
decode(nvl(executions_delta,0),0,1,executions_delta))/1000000 avg_etime,
(buffer_gets_delta/
decode(nvl(executions_delta,0),0,1,executions_delta)) avg_lio,
(disk_reads_delta/
decode(nvl(executions_delta,0),0,1,executions_delta)) avg_pio
from DBA_HIST_SQLSTAT S,DBA_HIST_SNAPSHOT SS
where sql_id='&sql_id'
and ss.snap_id=s.snap_id
and ss.instance_number=s.instance_number
and s.instance_number like '&instance_number'
and executions_delta > 0
order by 1,2,3
/
5. From the output above, determine whether the plan changed, then display the current execution plan
set pagesize 999
set linesize 380
select * from table(dbms_xplan.display_cursor('&sql_id',NULL));
6. Find where a SQL_ID was issued from, by snapshot range (querying the whole view unfiltered is too expensive)
set linesize 555
set pagesize 999
col event format a40
col program format a30
col wait_class format a20
select snap_id,instance_number,session_id,sql_id,event,wait_class,p1,p2,p3,blocking_session,blocking_inst_id,program
from dba_hist_active_sess_history
where snap_id > &start_snap_id
and snap_id < &last_snap_id
and sql_id = '&sql_id'
order by snap_id,program;
===================================================== Wait Events ======================================================================
##Use a logoff trigger to record each session's wait events
With this script deployed in the database, you can review after the fact what every foreground session did.
create table system.session_event_history
tablespace <name>
storage (freelist groups <value>)
initrans <value>
as
select b.sid,
b.serial#,
b.username,
b.osuser,
b.paddr,
b.process,
b.logon_time,
b.type,
a.event,
a.total_waits,
a.total_timeouts,
a.time_waited,
a.average_wait,
a.max_wait,
sysdate as logoff_timestamp
from v$session_event a, v$session b
where 1 = 2;
create table system.sesstat_history
tablespace <name>
storage (freelist groups <value>)
initrans <value>
as
select c.username,
c.osuser,
a.sid,
c.serial#,
c.paddr,
c.process,
c.logon_time,
a.statistic#,
b.name,
a.value,
sysdate as logoff_timestamp
from v$sesstat a, v$statname b, v$session c
where 1 = 2;
create or replace trigger sys.logoff_trig
before logoff on database
declare
logoff_sid pls_integer;
logoff_time date := sysdate;
begin
select sid into logoff_sid from v$mystat where rownum < 2;
insert into system.session_event_history
(sid,
serial#,
username,
osuser,
paddr,
process,
logon_time,
type,
event,
total_waits,
total_timeouts,
time_waited,
average_wait,
max_wait,
logoff_timestamp)
select a.sid,
b.serial#,
b.username,
b.osuser,
b.paddr,
b.process,
b.logon_time,
b.type,
a.event,
a.total_waits,
a.total_timeouts,
a.time_waited,
a.average_wait,
a.max_wait,
logoff_time
from v$session_event a, v$session b
where a.sid = b.sid
and b.username = login_user
and b.sid = logoff_sid;
insert into system.sesstat_history
(username,
osuser,
sid,
serial#,
paddr,
process,
logon_time,
statistic#,
name,
value,
logoff_timestamp)
select c.username,
c.osuser,
a.sid,
c.serial#,
c.paddr,
c.process,
c.logon_time,
a.statistic#,
b.name,
a.value,
logoff_time
from v$sesstat a, v$statname b, v$session c
where a.statistic# = b.statistic#
and a.sid = c.sid
and b.name in ('CPU used when call started', --CPU time consumed as of the start of the current call
'CPU used by this session', --total CPU time used by the session between user-call start and end
'recursive cpu usage', --total CPU time used by recursive (non-user) calls
'parse time cpu') --total CPU time spent on (hard and soft) parsing
and c.sid = logoff_sid
and c.username = login_user;
end;
/
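Once the trigger is deployed, the recorded history can be reviewed after the fact, e.g. (a sketch against the tables created above; the one-day filter is an arbitrary example):
select username, sid, event, total_waits, time_waited, logoff_timestamp
from system.session_event_history
where logoff_timestamp > sysdate - 1
order by time_waited desc;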
##Wait-event breakdown over a time window
select /*+ parallel(d,16) */sql_id,event,count(*) from dba_hist_active_sess_history d
where d.event in ('db file sequential read','gc buffer busy acquire','read by other session','latch: cache buffers chains') and d.dbid=1714394454
and d.sample_time between to_date('2017-03-01 07:00:00','yyyy-mm-dd hh24:mi:ss') and to_date('2017-03-01 09:00:00','yyyy-mm-dd hh24:mi:ss')
and d.instance_number=1 group by sql_id,event order by count(*),sql_id,event;
##Meaning of p1/p2/p3 for a wait event
col name for a25;
col p1 for a10;
col p2 for a10;
col p3 for a10;
SELECT NAME, PARAMETER1 P1, PARAMETER2 P2, PARAMETER3 P3
FROM V$EVENT_NAME
WHERE NAME = '&event_name';
--Query SQL execution history
--v$active_session_history is the in-memory ASH, sampled once per second (from v$session_wait); it is usually used for current problems. dba_hist_active_sess_history is the overview of ASH data persisted by AWR sampling, and is the view to use for historical analysis.
--BLOCKING_SESSION is the session_id of the blocking session
select to_char(SAMPLE_TIME,'yyyy-mm-dd hh24:mi:ss') SAMPLE_TIME,SESSION_ID,SESSION_STATE,USER_ID,SQL_ID,BLOCKING_SESSION,CURRENT_OBJ# from dba_hist_active_sess_history where
SAMPLE_TIME>to_date('20130825 08:00','yyyymmdd hh24:mi:ss') and event ='enq: TX - row lock contention';
select to_char(SAMPLE_TIME,'yyyy-mm-dd hh24:mi:ss') SAMPLE_TIME,SESSION_ID,USER_ID,SQL_ID,BLOCKING_SESSION,CURRENT_OBJ# from dba_hist_active_sess_history where
SESSION_ID=1598;
select to_char(SAMPLE_TIME,'yyyy-mm-dd hh24:mi:ss') SAMPLE_TIME,SESSION_ID,USER_ID,SQL_ID,BLOCKING_SESSION,CURRENT_OBJ# from v$active_session_history where
SESSION_ID=1598;
select to_char(b.end_interval_time,'yyyy-mm-dd hh24:mi:ss') sample_time, a.*
from dba_hist_sqlstat a,dba_hist_snapshot b
where a.snap_id=b.snap_id and a.sql_id in ('dfafaf','dkafja');
select to_char(SAMPLE_TIME,'yyyy-mm-dd hh24:mi:ss') SAMPLE_TIME,SESSION_ID,SESSION_STATE,USER_ID,event,SQL_ID,BLOCKING_SESSION,CURRENT_OBJ# from dba_hist_active_sess_history where
SAMPLE_TIME>to_date('20130825 08:00','yyyymmdd hh24:mi:ss') ;
##Analyzing dba_hist_active_sess_history offline
create table tmp_ash_20180920 as select * from dba_hist_active_sess_history
where sample_time between to_date('20180919 02:00','yyyymmdd hh24:mi') and to_date('20180920 05:00','yyyymmdd hh24:mi');
create table tmp_sql_20180920 as select * from WRH$_SQLTEXT where sql_id in (select sql_id from tmp_ash_20180920);
exp "'/ as sysdba'" file=tmp_ash_20180920.dmp tables=tmp_ash_20180920,tmp_sql_20180920
imp "'/ as sysdba'" file=tmp_ash_20170517.dmp full=y
select instance_number,sample_id,sample_time,event,count(*) from tmp_ash_20170517 group by instance_number,sample_id,sample_time,event order by 1,3,5;
col SQL_ID for a20
col EVENT for a20
col SQL_OPNAME for a10
col SAMPLE_TIME for a30
col PROGRAM for a40
select sample_time, session_id, user_id, sql_id, sql_opname,event,blocking_session,program from tmp_ash_20180920 where session_id=4338 order by 1;
select session_id, sample_id, user_id, sql_id, is_sqlid_current,sql_opname,event,blocking_session,top_level_call_name,program from tmp_ash_20180920 where sample_id in (24810796) order by session_id;
##Wait-event trend over time in ASH
SELECT to_char(SAMPLE_TIME,'yyyy-mm-dd hh24:mi:ss') SAMPLE_TIME, event, COUNT(*) FROM dba_hist_active_sess_history
WHERE sample_time between to_date('20170510 00:00','yyyymmdd hh24:mi') and to_date('20170510 00:30','yyyymmdd hh24:mi')
GROUP BY to_char(SAMPLE_TIME,'yyyy-mm-dd hh24:mi:ss'),event ORDER BY 1;
--Top-n events per ASH sample point --http://feed.askmaclean.com/archives/dba_hist_active_sess_history.html
select t.dbid,
t.sample_id,
t.sample_time,
t.instance_number,
t.event,
t.session_state,
t.c session_count
from (select t.*,
rank() over(partition by dbid, instance_number, sample_time order by c desc) r
from (select /*+ parallel(8) */
t.*,
count(*) over(partition by dbid, instance_number, sample_time, event) c,
row_number() over(partition by dbid, instance_number, sample_time, event order by 1) r1
from dba_hist_active_sess_history t
where sample_time >
to_timestamp('&begin_sample_time',
'yyyy-mm-dd hh24:mi:ss')
and sample_time <
to_timestamp('&end_sample_time',
'yyyy-mm-dd hh24:mi:ss')
) t
where r1 = 1) t
where r < 3
order by dbid, instance_number, sample_time, r;
--Examine the wait chain at each sample point (m_ash20180322 is assumed to be a pre-created working copy of dba_hist_active_sess_history):
select
level lv,
connect_by_isleaf isleaf,
connect_by_iscycle iscycle,
t.dbid,
t.sample_id,
t.sample_time,
t.instance_number,
t.session_id,
t.sql_id,
t.session_type,
t.event,
t.session_state,
t.blocking_inst_id,
t.blocking_session,
t.blocking_session_status
from m_ash20180322 t
where sample_time >
to_timestamp('2018-03-22 09:59:00',
'yyyy-mm-dd hh24:mi:ss')
and sample_time <
to_timestamp('2018-03-22 10:00:00',
'yyyy-mm-dd hh24:mi:ss')
start with blocking_session is not null
connect by nocycle
prior dbid = dbid
and prior sample_time = sample_time
/*and ((prior sample_time) - sample_time between interval '-1'
second and interval '1' second)*/
and prior blocking_inst_id = instance_number
and prior blocking_session = session_id
and prior blocking_session_serial# = session_serial#
order siblings by dbid, sample_time;
Then filter for isleaf = 1 to pick out the leaves (the top holders).
--Based on the same logic, find the final top holder at each sample point:
select t.lv,
t.iscycle,
t.dbid,
t.sample_id,
t.sample_time,
t.instance_number,
t.session_id,
t.sql_id,
t.session_type,
t.event,
t.seq#,
t.session_state,
t.blocking_inst_id,
t.blocking_session,
t.blocking_session_status,
t.c blocking_session_count
from (select t.*,
row_number() over(partition by dbid, instance_number, sample_time order by c desc) r
from (select t.*,
count(*) over(partition by dbid, instance_number, sample_time, session_id) c,
row_number() over(partition by dbid, instance_number, sample_time, session_id order by 1) r1
from (select /*+ parallel(8) */
level lv,
connect_by_isleaf isleaf,
connect_by_iscycle iscycle,
t.*
from m_ash20180322 t
where sample_time >
to_timestamp('2018-03-22 09:59:00',
'yyyy-mm-dd hh24:mi:ss')
and sample_time <
to_timestamp('2018-03-22 10:00:00',
'yyyy-mm-dd hh24:mi:ss')
start with blocking_session is not null
connect by nocycle
prior dbid = dbid
and prior sample_time = sample_time
/*and ((prior sample_time) - sample_time between interval '-1'
second and interval '1' second)*/
and prior blocking_inst_id = instance_number
and prior blocking_session = session_id
and prior
blocking_session_serial# = session_serial#) t
where t.isleaf = 1) t
where r1 = 1) t
where r < 3
order by dbid, sample_time, r;
##Find historically bad SQL
The following script shows which wait events past sessions went through, ordered by wait count descending:
select session_id,event,count(*),sum(time_waited) from v$active_session_history where session_state='WAITING'
and time_waited>0 and sample_time>=(sysdate-&howlongago_min/(24*60))
group by session_id,event order by 3 desc;
Then find the sql_id responsible for the wait event:
select event, session_id,sql_id, p1,p2,p3 from v$active_session_history where sample_time>=(sysdate-&howlongago/(24*60)) and session_id=&sid;
Finally retrieve the SQL text behind it:
select sql_text from v$sqlarea where sql_id='&sql_id';
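If the cursor is no longer in the shared pool, the text may still be in the AWR repository (a sketch):
select sql_text from dba_hist_sqltext where sql_id='&sql_id';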
##Measure SQL execution cost with SQL Monitoring
In Oracle 11g a statement is captured by SQL Monitor, and recorded in the v$sql_monitor view, when any of the following holds:
■ it executes in parallel (monitored immediately);
■ it runs serially and consumes more than 5 seconds of CPU or I/O time (controlled by the hidden parameter _sqlmon_threshold, default 5 seconds; setting it to 0 disables SQL monitoring);
■ it carries the /*+ monitor */ hint, e.g. select /*+ monitor */ * from dba_tables;
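Captured executions can be listed from v$sql_monitor before generating a report (a minimal sketch; v$sql_monitor requires the Tuning Pack):
select sql_id, status, sid, sql_exec_id,
to_char(sql_exec_start,'yyyy-mm-dd hh24:mi:ss') exec_start,
elapsed_time/1000000 elapsed_s
from v$sql_monitor
order by sql_exec_start desc;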
set trimspool on
set arraysize 512
set trim on
set pagesize 0
set linesize 1000
set long 1000000
set longchunksize 1000000
spool sqlmon.html
select /*+ noparallel */ dbms_sqltune.report_sql_monitor (
sql_id=>'&sql_id',
report_level=>'ALL',
type=>'ACTIVE')
from dual;
spool off
cat sqlmon.html
##direct path read
Common causes of direct path read:
1. Heavy disk sorts that cannot fit in the sort area and spill to the temp tablespace.
2. Heavy hash joins that use the temp tablespace for the hash area.
3. Full scans of large tables: in Oracle 11g the full-scan algorithm changed, and depending on table size, buffer-cache size and other factors, Oracle may bypass the SGA and read straight from disk, whereas 10g always read through the buffer cache (table scan (large)).
11g assumes that a direct path read of a large table can be faster than 10g's db file scattered reads and uses fewer latches.
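To confirm which SQL and objects are driving direct path read, ASH can be grouped by object (a sketch; current_obj# maps to dba_objects.object_id):
select sql_id, current_obj#, count(*)
from dba_hist_active_sess_history
where event = 'direct path read'
group by sql_id, current_obj#
order by 3 desc;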
##An ASH analysis case study
1. Get the trend of active-session counts over time during the incident:
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss'), COUNT(1) FROM dba_hist_active_sess_history
WHERE instance_number=1
GROUP BY to_char(sample_time, 'yyyymmdd hh24:mi:ss')
ORDER BY 1;
2. Wait events at the active-session peak:
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss'), event, COUNT(*) FROM dba_hist_active_sess_history
WHERE to_char(sample_time, 'yyyymmdd hh24:mi')='20161120 20:44'
AND instance_number=1
GROUP BY to_char(sample_time, 'yyyymmdd hh24:mi:ss'), event
HAVING COUNT(*)>=10;
--waits related to RAC global cache
3. At one instant, find which blocks (p1/p2) the hundred-odd sessions waiting on gc buffer busy acquire were requesting, and who blocked them (blocking_session):
SELECT p1,p2,blocking_inst_id||'_'||blocking_session block_sid, COUNT(1)
FROM dba_hist_active_sess_history
WHERE to_char(sample_time, 'yyyymmdd hh24:mi:ss')='20161120 20:44:04'
AND instance_number=1
AND event='gc buffer busy acquire'
GROUP BY p1,p2, blocking_inst_id||'_'||blocking_session
ORDER BY COUNT(1);
P1 P2 BLOCK_SID COUNT(1)
--- ------- ----------- --------
47 1540116 1_3200 82
......
At the moment the problem began (20:44:04), 82 sessions on node 1 were requesting the same block and waiting on gc buffer busy acquire, all blocked by SID 3200 on node 1, i.e. that session had asked node 2 for the block ahead of the other 82.
4. What was node 1 SID 3200 waiting on?
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss') sample_time, instance_number||'_'||session_id sid, event, session_state,state, seq#, p1,p2,blocking_inst_id||'_'||blocking_session block_sid
FROM dba_hist_active_sess_history
WHERE instance_number||'_'||session_id='1_3200'
AND to_char(sample_time, 'yyyymmdd hh24:mi:ss') BETWEEN '20161120 20:44:04' AND '20161120 20:44:24'
ORDER BY 1;
SAMPLE_TIME SID EVENT STATE SEQ# P1 P2 BLOCK_SID
------------------- ------- ----------------------- --------- ------- ----- --------- -----------
20161120 20:44:04 1_3200 gc current block busy WAITING 7667 47 1540116 _
20161120 20:44:14 1_3200 gc current block busy WAITING 7667 47 1540116 _
20161120 20:44:24 1_3200 gc current block busy WAITING 7667 47 1540116 _
Node 1 SID 3200, requesting file 47 block 1540116, could not obtain the block quickly; its wait event was gc current block busy, and the request lasted from 20:44:04 to 20:44:24, over 20 seconds. Frustratingly, there is no way here to see who its blocker was.
5. Start from the SQL_ID
The customer had already reported one online insert into the message table that ran from 2016-11-20 20:44:04 to 20:44:34, about 30 seconds, so start there:
SELECT sql_exec_id, COUNT(1), to_char(MIN(sample_time), 'yyyymmdd hh24:mi:ss') min_sample_time, to_char(MAX(sample_time), 'yyyymmdd hh24:mi:ss') max_sample_time
FROM dba_hist_active_sess_history
WHERE sql_id='15vru8xqgdkjf'
GROUP BY sql_exec_id
ORDER BY COUNT(1) DESC;
SQL_EXEC_ID COUNT(1) MIN_SAMPLE_TIME MAX_SAMPLE_TIME
------------- --------- ------------------- -------------------
22616995 4 20161120 20:44:04 20161120 20:44:34
......
--SQL_ID '15vru8xqgdkjf' was caught in 4 consecutive ASH samples, so its execution exceeded 30 seconds, matching the customer's description.
6. Use sql_exec_id, the unique execution id, to examine this long-running execution:
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss') sample_time, instance_number||'_'||session_id sid, event, session_state, p1, p2,
blocking_inst_id||'_'||blocking_session block_sid FROM dba_hist_active_sess_history
WHERE sql_exec_id=22616995
AND sql_id='15vru8xqgdkjf'
ORDER BY sample_time;
SAMPLE_TIME SID EVENT STATE P1 P2 BLOCK_SID
------------------- ------- ------------------- --------- ----- --------- ------------
20161120 20:44:04 1_2724 gc current retry WAITING 43 1152178 _
20161120 20:44:14 1_2724 gc current retry WAITING 43 1152178 _
20161120 20:44:24 1_2724 gc current retry WAITING 43 1152178 _
20161120 20:44:34 1_2724 gc current retry WAITING 39 1770308 1_2449
So the longest execution waited on "gc current retry" from 20:44:04 through 20:44:24: literally, it requested a current block, could not obtain it, and kept retrying.
Since the block was being requested from node 2, and the process on node 2 that supplies blocks is the LMS process, the natural next step is to check what the LMS processes were doing during the problem window:
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss') sample_time, instance_number||'_'||session_id sid, PROGRAM, event, session_state,state, seq#, p1,p2,blocking_inst_id||'_'||blocking_session block_sid
FROM dba_hist_active_sess_history
WHERE instance_number=2
AND upper(PROGRAM) LIKE '%LMS%'
AND to_char(sample_time, 'yyyymmdd hh24:mi:ss') BETWEEN '20161120 20:44:04' AND '20161120 20:44:24'
ORDER BY sid, sample_time;
SAMPLE_TIME SID PROGRAM EVENT STATE P1 P2 BLOCK_SID
------------------- ------- ----------------------- ------------------- --------- ----- --------- ------------
20161120 20:44:06 2_1035 oracle@corerac2 (LMS9) gcs log flush sync WAITING 30 0 2_2633
20161120 20:44:16 2_1035 oracle@corerac2 (LMS9) gcs log flush sync WAITING 30 0 2_2633
20161120 20:44:06 2_1082 oracle@corerac2 (LMSA) gcs log flush sync WAITING 30 0 2_2633
20161120 20:44:16 2_1082 oracle@corerac2 (LMSA) gcs log flush sync WAITING 30 0 2_2633
20161120 20:44:06 2_1129 oracle@corerac2 (LMSB) gcs log flush sync WAITING 30 0 2_2633
20161120 20:44:16 2_1129 oracle@corerac2 (LMSB) gcs log flush sync WAITING 30 0 2_2633
.....
The LMS processes were waiting on gcs log flush sync: before shipping a block to another node, LMS must flush the block's pending redo from the log buffer to the online redo logs, and here the flush would not complete for a long time, so the block could not be shipped to node 1, which kept retrying. The blocker of gcs log flush sync, 2_2633, is naturally the LGWR process, the log-buffer writer.
Was node 2's LGWR busy with something else and unable to service LMS's flush requests?
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss') sample_time, instance_number||'_'||session_id sid, PROGRAM, event, session_state,state, seq#, p1,p2,blocking_inst_id||'_'||blocking_session block_sid
FROM dba_hist_active_sess_history
WHERE instance_number||'_'||session_id='2_2633'
AND to_char(sample_time, 'yyyymmdd hh24:mi:ss') BETWEEN '20161120 20:44:04' AND '20161120 20:44:24'
ORDER BY sample_time;
SAMPLE_TIME SID PROGRAM EVENT STATE SEQ# P1 P2 BLOCK_SID
------------------- ------- ----------------------- ----------------------- --------- ----- ----------- --------- ------------
20161120 20:44:06 2_2633 oracle@corerac2 (LGWR) enq: CF - contention WAITING 30 1128660997 0 _
20161120 20:44:16 2_2633 oracle@corerac2 (LGWR) enq: CF - contention WAITING 30 1128660997 0 1_1369
20161120 20:44:26 2_2633 oracle@corerac2 (LGWR) enq: CF - contention WAITING 30 1128660997 0 1_1369
20161120 20:44:36 2_2633 oracle@corerac2 (LGWR) log file parallel write WAITING 30 1 118 _
Node 2's LGWR was waiting on enq: CF - contention: it wanted to write the control file (a log switch, for instance, requires that) but someone had beaten it to the lock, namely session 1369 on node 1.
LGWR simply kept waiting for the control-file enqueue, for tens of seconds, without getting it.
Press on and check why session 1369 on node 1 held the control-file enqueue for so long:
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss') sample_time, instance_number||'_'||session_id sid, PROGRAM, event, session_state,state, seq#, p1,p2,blocking_inst_id||'_'||blocking_session block_sid
FROM dba_hist_active_sess_history
WHERE instance_number||'_'||session_id='1_1369'
AND to_char(sample_time, 'yyyymmdd hh24:mi:ss') BETWEEN '20161120 20:44:04' AND '20161120 20:44:24'
ORDER BY sample_time;
SAMPLE_TIME SID PROGRAM EVENT STATE SEQ# P1 P2 BLOCK_SID
------------------- ------- ----------------------- ---------------------------- --------- ----- ----------- --------- ------------
20161120 20:44:04 1_1369 oracle@corerac2 (CKPT) control file sequential read WAITING 16094 0 39 _
SID 1369 is node 1's CKPT checkpoint process: it was reading and writing the control file and therefore held the control-file enqueue.
Its waits were control file sequential read and control file parallel write, it was not blocked by anyone, and its waits kept progressing; normally, with fast I/O, the control-file enqueue is held only very briefly.
OSWatcher data shows that during the problem window many disks were 100% busy with average response times over 200 ms, and the I/O was mostly writes.
Check the write-related wait events in ASH:
SELECT to_char(sample_time, 'yyyymmdd hh24:mi:ss'), event, COUNT(1) FROM dba_hist_active_sess_history
WHERE instance_number=1
AND event LIKE '%write%'
GROUP BY to_char(sample_time, 'yyyymmdd hh24:mi:ss'), event
ORDER BY 1,3;
At the problem time, 24 processes were waiting on db file parallel write: all 24 DBWR processes of this database were active, so something had triggered massive flushing of dirty blocks.
The alert log shows an alter system archive log all (current) command, which necessarily triggers DBWR to flush dirty blocks en masse, making the disks extremely busy for a short period and possibly saturating the HBA bandwidth as well.
The answer is the familiar RMAN backup script!
Note, though, that the truth is not what you might expect: it was not the backup's own I/O that made the disks busy.
As shown above, the 100% disk busy came from writes, not reads, and RMAN mostly reads; had RMAN's read I/O been the cause, the impact would not have lasted only about 30 seconds.
The alter system archive log all (current) issued at the start of the RMAN script triggered DBWR to flush a large number of dirty blocks, making the disks very busy. Because the control files share disks with the data files, CKPT became very slow after acquiring the control-file enqueue, which blocked LGWR, which in turn stopped responding to LMS, which finally produced GC waits of tens of seconds, i.e. mass timeouts of online transactions. Once the full checkpoint completed, transactions returned to normal.
Root-cause summary:
- During the batch window a large number of dirty blocks were generated.
- CKPT acquired the control-file enqueue and began its I/O.
- Meanwhile the RMAN backup issued alter system archive log, triggering massive DBWR writes to the data files; the disks went 100% busy and control-file I/O could no longer complete quickly.
- Because of the log switch, LGWR needed the control-file enqueue; CKPT blocked LGWR, which sat in enq: CF - contention and could not service LMS's log-flush requests. LMS handles inter-node GC, so GC stalled and the INSERT statements timed out.
Operational recommendations:
- Separate control files from data files down at the RAID-group level, onto other disk groups, so they are not affected by DBWR write I/O and RMAN read I/O.
- The alter system archive log all in the RMAN backup script forces a full checkpoint and heavy DBWR I/O, dragging down overall performance while it runs.
- Re-evaluate alternatives to alter system archive log all.
- Schedule RMAN backups away from batch windows.
##gc-related wait events
In short, gc works like this: when instance 1 requests a block that is already cached in another instance, GCS (the LMS processes) ships the block to instance 1 over the private interconnect; this is the cluster's cache fusion. Requested blocks are either current blocks or CR blocks.
1. gc buffer busy acquire
Session #1 on node 1 wants to access a buffer held by a remote instance, but another session #2 on the same node requested the same buffer earlier and that transfer has not completed, so session #1 waits on gc buffer busy acquire.
Common causes of gc buffer busy acquire:
■ Hot blocks
In AWR, Segments by Global Cache Buffer Busy records the most frequently contended gc buffers (compare against a normal period). Remedies depend on the hot-block type: partitioned tables, partitioned indexes, reverse-key indexes, and so on. This is analogous to buffer busy waits on a single instance.
■ Inefficient SQL
Inefficient SQL requests buffers unnecessarily, increasing the chance of buffer busy contention. Find the top SQL in AWR and tune it to cut buffer visits; again analogous to single-instance buffer busy waits.
■ Cross-instance data access
In RAC the same data may be requested from several instances. Where the application permits, route different application functions/modules to different instances so the same data is not accessed from multiple instances at once; this reduces buffer contention and avoids gc waits.
■ Oracle bugs
2. gc cr block busy
Both instance 1 and instance 2 have a block in their buffer caches; at some point instance 1 modifies it without committing. When a session on instance 2 then reads the block, it must build a CR copy from undo records, and the changes made while building it must first be recorded in instance 1's redo; the wait observed during this is gc cr block busy.
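To locate the segments behind gc contention from ASH, the samples can be joined to dba_objects (a minimal sketch; current_obj# is not populated for every sample, so treat the result as indicative):
select o.owner, o.object_name, h.event, count(*)
from dba_hist_active_sess_history h, dba_objects o
where h.current_obj# = o.object_id
and h.event like 'gc buffer busy%'
group by o.owner, o.object_name, h.event
order by 4 desc;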
===================================================== Wait Events ======================================================================
===================================================== Reclaiming the High-Water Mark ======================================================================
How to find high-water-mark reclaim candidates (statistics must be current first):
ANALYZE TABLE test1 COMPUTE STATISTICS;
select
a.owner,
a.table_name,
a.blocks*to_number(p.value)/1024/1024 size_mb,
a.avg_row_len*a.num_rows/(a.blocks*to_number(p.value))*100 pct,
to_char(a.last_analyzed,'yyyy-mm-dd hh24:mi:ss') last_analyzed
from dba_tables a,v$parameter p
where p.name = 'db_block_size'
and a.blocks > 0
and a.avg_row_len*a.num_rows/(a.blocks*to_number(p.value))*100 < 0.7
and a.blocks*to_number(p.value)/1024/1024 >= 10
and a.owner in ('ULTRANMS');
For scale: an ALTER TABLE ... SHRINK took 20 hours on a 150GB table, while ALTER TABLE ... MOVE took 10 hours on the same size.
##Using move (move needs up to twice the space)
1. Lower the high-water mark
For a whole table:
alter table scott.emp move;
For a single partition:
alter table scott.emp move partition P201209;
2. Rebuild indexes (move leaves them unusable)
Check local index status:
select index_name,index_owner,partition_name,status from dba_ind_partitions where index_name in ('IDX_WORKLIST_SUBSID','PK_OM_WL_WORKLIST');
Rebuild a local index (local indexes are always partitioned), either as a whole:
alter index NGCRM_JM.IDX_WORKLIST_SUBSID rebuild parallel 8;
or partition by partition:
alter index NGCRM_JM.IDX_WORKLIST_SUBSID rebuild partition P201209 parallel 8;
Check global index status (the primary-key index here is global):
select owner index_owner,index_name,status from dba_indexes where index_name in ('IDX_WORKLIST_SUBSID','PK_OM_WL_WORKLIST');
Rebuild a global index:
alter index NGCRM_JM.IDX_WORKLIST_SUBSID rebuild parallel 8;
3. Regather optimizer statistics, for example as below.
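A sketch (owner/table names are placeholders):
exec dbms_stats.gather_table_stats(ownname=>'SCOTT', tabname=>'EMP', cascade=>true, degree=>8);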
Notes when running a move:
1. Take a logical backup of the table before reorganizing it.
2. For large tables, enable parallelism and nologging:
alter table test move nologging parallel 2;
3. Rebuild the related indexes afterwards:
alter index ind_test rebuild nologging parallel 2;
4. Restore the table's and indexes' parallel degree and logging attributes:
alter table test logging parallel 1;
5. While a table is being moved, v$locked_objects shows an exclusive lock on it (see the query below), so run moves outside business hours.
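A sketch of that lock check (joins v$locked_objects to dba_objects and v$session):
select s.sid, s.serial#, o.owner, o.object_name, l.locked_mode
from v$locked_objects l, dba_objects o, v$session s
where l.object_id = o.object_id
and l.session_id = s.sid;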
##Using shrink to reclaim the HWM of a whole table; when move would strain resources, consider shrink (it needs no extra space)
Oracle maintains the indexes during a shrink, so they remain valid when the shrink completes.
alter table ICDWF.WF_TBL_NODEINFO enable row movement;
alter table ICDWF.WF_TBL_NODEINFO shrink space cascade; --cascade also reorganizes the index space
alter table ICDWF.WF_TBL_NODEINFO disable row movement;
Segment shrink runs in two phases:
1. Compaction (compact): rows are moved toward the front of the segment through a series of inserts and deletes. Only RX locks are taken, and only on the rows being moved; a large amount of undo and redo is generated.
Because rowids change, row movement must be enabled and rowid-based triggers disabled; this phase has little impact on the application.
2. HWM adjustment: the HWM is lowered and the freed blocks are released. This phase takes an X lock on the table, blocking all DML against it, which can hurt badly on a busy system.
Note: shrink space performs both phases;
shrink space compact performs only the first.
If the system is busy, run shrink space compact first to reorganize the data, then run shrink space in a quiet period to lower the HWM and release the free blocks.
alter table images shrink space compact; -- do not lock table, run online
alter table images shrink space ; -- locks the table; run while users are not active
alter table images modify lob (image) (shrink space);
--Timing case study:
OWNER TABLE_NAME COLUMN_NAME SEGMENT_NAME SEGMENT_TYPE ROUND(B.BYTES/1024/1024)
--------------- -------------------- ------------------------------ ------------------------------ ------------------ ------------------------
HR65_APP SM_BUSILOG_DEFAULT TABLE 8297 --table size
HR65_APP SM_BUSILOG_DEFAULT LOGMSG SYS_LOB0000085979C00006$$ LOBSEGMENT 299871 --LOB segment size
HR65_APP SM_BUSILOG_DEFAULT LOGMSG SYS_IL0000085979C00006$$ LOBINDEX 593 --LOB index size
TABLE_NAME NUM_ROWS ACTUAL_MB HIGH_WATER_MB NEED_RECOVERY_MB
------------------------------ ---------- ---------- ------------- ----------------
SM_BUSILOG_DEFAULT 1838803 2700.57356 8291.44531 5590.87175
alter table HR65_APP.SM_BUSILOG_DEFAULT shrink space compact;
Elapsed on the test database: 02:24:12.95
Afterwards neither the table size nor the LOB segment size had changed.
ALTER TABLE HR65_APP.SM_BUSILOG_DEFAULT MODIFY LOB (LOGMSG) (SHRINK SPACE compact);
Elapsed on the test database: 08:02:45.11
Afterwards the LOB segment size had not changed.
ALTER TABLE HR65_APP.SM_BUSILOG_DEFAULT SHRINK SPACE CASCADE;
Elapsed: 01:52:45.64
ALTER TABLE HR65_APP.SM_BUSILOG_DEFAULT MODIFY LOB (LOGMSG) (SHRINK SPACE);
Elapsed: 00:16:43.50
OWNER TABLE_NAME COLUMN_NAME SEGMENT_NAME SEGMENT_TYPE ROUND(B.BYTES/1024/1024)
--------------- -------------------- ------------------------------ ------------------------------ ------------------ ------------------------
HR65_APP SM_BUSILOG_DEFAULT TABLE 3430
HR65_APP SM_BUSILOG_DEFAULT LOGMSG SYS_LOB0000085979C00006$$ LOBSEGMENT 30695
HR65_APP SM_BUSILOG_DEFAULT LOGMSG SYS_IL0000085979C00006$$ LOBINDEX 593
--Both alter table move and alter table shrink compact a table and clean up fragmentation, but there is one big difference:
After a move, the table's position inside the tablespace always changes: it moves forward if enough free space exists before it, otherwise backward.
After a shrink, the table's position (its segment header) never changes.
Consequently, after moving every table in a tablespace the tables are clustered at its front and the tablespace can be shrunk; after shrinking every table, the layout is unchanged and there is still little room to shrink the tablespace.
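The segment header position can be compared before and after to verify this (a sketch; the table name is a placeholder):
select owner, segment_name, header_file, header_block
from dba_segments
where segment_name = 'TEST1';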
#Inspect block usage below the high-water mark
Blocks below the HWM can be:
o Allocated, but currently unformatted and unused
o Formatted and containing data
o Formatted and empty because the data was deleted
For ASSM segments, examining block usage below the HWM is relatively simple; a procedure like this reports a table's block usage:
create or replace procedure show_space_assm(
p_segname in varchar2,
p_owner in varchar2 default user,
p_type in varchar2 default 'TABLE' )
as
l_fs1_bytes number;
l_fs2_bytes number;
l_fs3_bytes number;
l_fs4_bytes number;
l_fs1_blocks number;
l_fs2_blocks number;
l_fs3_blocks number;
l_fs4_blocks number;
l_full_bytes number;
l_full_blocks number;
l_unformatted_bytes number;
l_unformatted_blocks number;
procedure p( p_label in varchar2, p_num in number )
is
begin
dbms_output.put_line( rpad(p_label,40,'.') ||p_num );
end;
begin
dbms_space.space_usage(
segment_owner => p_owner,
segment_name => p_segname,
segment_type => p_type,
fs1_bytes => l_fs1_bytes,
fs1_blocks => l_fs1_blocks,
fs2_bytes => l_fs2_bytes,
fs2_blocks => l_fs2_blocks,
fs3_bytes => l_fs3_bytes,
fs3_blocks => l_fs3_blocks,
fs4_bytes => l_fs4_bytes,
fs4_blocks => l_fs4_blocks,
full_bytes => l_full_bytes,
full_blocks => l_full_blocks,
unformatted_blocks => l_unformatted_blocks,
unformatted_bytes => l_unformatted_bytes);
p('free space 0-25% Blocks:',l_fs1_blocks);
p('free space 25-50% Blocks:',l_fs2_blocks);
p('free space 50-75% Blocks:',l_fs3_blocks);
p('free space 75-100% Blocks:',l_fs4_blocks);
p('Full Blocks:',l_full_blocks);
p('Unformatted blocks:',l_unformatted_blocks);
end;
/
Under ASSM a block's space usage falls into five buckets: free space 0-25%, 25-50%, 50-75%, 75-100%, and full; show_space_assm totals the number of blocks in each bucket for the given table.
Block usage of table HWM1:
SQL> set serveroutput on;
SQL> exec show_space_assm('HWM1','DLINGER');
free space 0-25% Blocks:.................0
free space 25-50% Blocks:...............1
free space 50-75% Blocks:...............0
free space 75-100% Blocks:..............8
Full Blocks:.....................................417
Unformatted blocks:.........................0
So table HWM1 has 417 full blocks, 8 blocks with 75-100% free space, and 1 block with 25-50% free space. When most blocks below a table's HWM report high free-space percentages, consider compacting the blocks and releasing the empty ones to lower the table's HWM.
SQL> set serveroutput on;
SQL> exec show_space_assm('SM_BUSILOG_DEFAULT','HR65_APP');
SQL> exec show_space_assm('SYS_LOB0000085979C00006$$','HR65_APP','LOB');
#Procedure to check the high-water mark (dbms_space.unused_space)
create or replace procedure show_space
( p_segname in varchar2,
p_owner in varchar2 default user,
p_type in varchar2 default 'TABLE',
p_partition in varchar2 default NULL )
as
l_total_blocks number;
l_total_bytes number;
l_unused_blocks number;
l_unused_bytes number;
l_LastUsedExtFileId number;
l_LastUsedExtBlockId number;
l_last_used_block number;
procedure p( p_label in varchar2, p_num in number )
is
begin
dbms_output.put_line( rpad(p_label,40,'.') ||
p_num );
end;
begin
dbms_space.unused_space
( segment_owner => p_owner,
segment_name => p_segname,
segment_type => p_type,
partition_name => p_partition,
total_blocks => l_total_blocks,
total_bytes => l_total_bytes,
unused_blocks => l_unused_blocks,
unused_bytes => l_unused_bytes,
last_used_extent_file_id => l_LastUsedExtFileId,
last_used_extent_block_id => l_LastUsedExtBlockId,
last_used_block => l_last_used_block );
p( 'Total Blocks', l_total_blocks );
p( 'Total Bytes', l_total_bytes );
p( 'Unused Blocks', l_unused_blocks );
p( 'Unused Bytes', l_unused_bytes );
p( 'Last Used Ext FileId', l_LastUsedExtFileId );
p( 'Last Used Ext BlockId', l_LastUsedExtBlockId );
p( 'Last Used Block', l_last_used_block );
end;
/
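Usage, mirroring show_space_assm above (a sketch; owner/table are placeholders):
set serveroutput on
exec show_space('EMP','SCOTT')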
===================================================== Reclaiming the High-Water Mark ====================================================================
##Table fragmentation (high-water mark)
SELECT --tables
TABLE_NAME,
OWNER,
PARTITIONED,
tablespace_name,
LAST_ANALYZED,
NUM_ROWS,
BLOCKS*8/1024 "SIZE_M(true)", --total allocated size (assumes 8K blocks)
(AVG_ROW_LEN*NUM_ROWS+24*INI_TRANS)/1024/1024 "USED_M(need)", --avg row length * row count + ITL slots (actual used size)
(BLOCKS*8/1024 - ((AVG_ROW_LEN*NUM_ROWS+24*INI_TRANS)/1024/1024)) "FREE_MB(release)", --reclaimable size
(1-(AVG_ROW_LEN*NUM_ROWS+24*INI_TRANS)/8192/BLOCKS)*100 "PCT(shrink)" --reclaimable percentage
FROM DBA_TABLES
where BLOCKS*8/1024 >5 and blocks != 0
and (1-(AVG_ROW_LEN*NUM_ROWS+24*INI_TRANS)/8192/BLOCKS)*100 > 20 --reclaimable percentage above 20%
order by "FREE_MB(release)", "SIZE_M(true)", "PCT(shrink)" asc;
SELECT --tables with LOB columns
t.TABLE_NAME,
t.OWNER,
LAST_ANALYZED,
b.segment_name,
b.lob_size_m,
NUM_ROWS,
BLOCKS*8/1024 "SIZE_M(true)",
(AVG_ROW_LEN*NUM_ROWS+24*INI_TRANS)/1024/1024 "USED_M(need)",
(BLOCKS*8/1024 - ((AVG_ROW_LEN*NUM_ROWS+24*INI_TRANS)/1024/1024)) "FREE_MB(release)",
(1-(AVG_ROW_LEN*NUM_ROWS+24*INI_TRANS)/8192/BLOCKS)*100 "PCT(shrink)"
FROM DBA_TABLES t,
(select * from (
select s.owner, l.table_name, s.segment_name , l.COLUMN_NAME, s.bytes/1024/1024 lob_size_m from dba_segments s, dba_lobs l
where s.segment_name=l.segment_name and s.owner=l.owner and s.segment_type='LOBSEGMENT' order by 5 desc )
where rownum<10) b
where t.OWNER=b.OWNER and t.TABLE_NAME=b.TABLE_NAME and t.BLOCKS*8/1024 >5 and blocks != 0
order by "FREE_MB(release)", "SIZE_M(true)", "PCT(shrink)" asc;
##Monitor tablespace growth
--Daily growth
select a.days, a.tsname, cur_size_mb,used_size_mb,usage_pert, used_size_mb - lag (used_size_mb,1) over ( partition by a.tsname order by a.tsname,a.days) inc_used_size_mb
from (
select to_char(sp.begin_interval_time,'mm-dd-yyyy') days, ts.tsname,
max(round((tsu.tablespace_size* dt.block_size )/(1024*1024),2) ) cur_size_mb,
max(round((tsu.tablespace_usedsize* dt.block_size )/(1024*1024),2)) used_size_mb,
round(max(tsu.tablespace_usedsize)/max(tsu.tablespace_size)*100, 1) usage_pert
from dba_hist_tbspc_space_usage tsu --relies on AWR snapshot data
inner join dba_hist_tablespace_stat ts on tsu.tablespace_id=ts.ts#
inner join dba_hist_snapshot sp on tsu.snap_id =sp.snap_id
inner join dba_tablespaces dt on ts.tsname=dt.tablespace_name
where
--(sp.begin_interval_time > sysdate-2) and
dt.tablespace_name not like '%UNDO%'
group by to_char(sp.begin_interval_time,'mm-dd-yyyy'), ts.tsname
order by ts.tsname, days
) a ;
--Average growth per day
break on report
compute sum of cur_used_size_mb on report
compute sum of avg_increas_mb on report
select b.tsname tablespace_name, max(b.used_size_mb) cur_used_size_mb, round(avg(inc_used_size_mb),2) avg_increas_mb
from (
select a.days, a.tsname, used_size_mb, used_size_mb - lag (used_size_mb,1) over ( partition by a.tsname order by a.tsname,a.days) inc_used_size_mb
from (
select to_char(sp.begin_interval_time,'mm-dd-yyyy') days, ts.tsname,
max(round((tsu.tablespace_usedsize* dt.block_size )/(1024*1024),2)) used_size_mb
from dba_hist_tbspc_space_usage tsu
inner join dba_hist_tablespace_stat ts on tsu.tablespace_id=ts.ts#
inner join dba_hist_snapshot sp on tsu.snap_id =sp.snap_id
inner join dba_tablespaces dt on ts.tsname=dt.tablespace_name
where
--(sp.begin_interval_time > sysdate-2) and
dt.tablespace_name not like '%UNDO%'
group by to_char(sp.begin_interval_time,'mm-dd-yyyy'), ts.tsname
order by ts.tsname, days
) a
) b
group by b.tsname order by b.tsname;
select sum(space_used_total)/1024/1024/1024 "last 7 days db increase - G"
from
dba_hist_seg_stat s,
dba_hist_seg_stat_obj o,
dba_hist_snapshot sn
where
s.obj# = o.obj#
and
sn.snap_id = s.snap_id
and begin_interval_time > sysdate-8
/
##Archive log volume per day
alter session set NLS_DATE_FORMAT='YYYY-MON-DD';
select trunc(completion_time) rundate
,count(*) logswitch
,round((sum(blocks*block_size)/1024/1024)) "REDO PER DAY (MB)"
from v$archived_log
group by trunc(completion_time)
order by 1;
Log switches per hour
select trunc(first_time, 'HH') , count(*)
from v$loghist
group by trunc(first_time, 'HH')
order by trunc(first_time, 'HH');
--Generate datafile resize statements (keeps roughly 100M headroom above the HWM)
set line 300 pagesize 1000
col cmd for a100
select /*+ ordered use_hash(a,c) */ 'alter database datafile '''||a.file_name||''' resize ' ||round(a.filesize - (a.filesize - c.hwmsize-100) *0.8)||'M;' cmd, a.filesize, c.hwmsize, a.filesize-round(a.filesize - (a.filesize - c.hwmsize-100) *0.8) shrink_size
from (select file_id,file_name,round(bytes/1024/1024) filesize from dba_data_files) a,
(select file_id,round(max(block_id)*8/1024) HWMsize from dba_extents group by file_id) c
where a.file_id = c.file_id and a.filesize - c.hwmsize > 100 order by shrink_size;
#Temp tablespace usage (HWM, RAC)
SELECT d.tablespace_name "Tablespace_name",
TO_CHAR(NVL(a.bytes / 1024 / 1024, 0),'99,999,990.900') "Size (M)",
TO_CHAR(NVL(t.hwm, 0)/1024/1024,'99999999.999') "HWM (M)",
TO_CHAR(NVL(t.hwm / a.bytes * 100, 0), '990.00') "HWM % " ,
TO_CHAR(NVL(t.bytes/1024/1024, 0),'99999999.999') "Using (M)",
TO_CHAR(NVL(t.bytes / a.bytes * 100, 0), '990.00') "Using %"
FROM sys.dba_tablespaces d,
(select tablespace_name, sum(bytes) bytes from dba_temp_files group by tablespace_name) a,
(select tablespace_name, sum(bytes_cached) hwm, sum(bytes_used) bytes from gv$temp_extent_pool group by tablespace_name) t
WHERE d.tablespace_name = a.tablespace_name(+)
AND d.tablespace_name = t.tablespace_name(+)
AND d.extent_management like 'LOCAL'
AND d.contents like 'TEMPORARY';
--Shrink the temp tablespace (11g and later)
alter tablespace temp shrink space keep 400M;
alter tablespace temp shrink space;
alter tablespace temp shrink tempfile '/u01/app/oracle/oradata/anqing/temp01.dbf' keep 300M;
alter tablespace temp shrink tempfile '/u01/app/oracle/oradata/anqing/temp01.dbf';
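Before shrinking, it can help to see which sessions currently hold temp segments (a minimal sketch; the MB math assumes 8K blocks):
select s.sid, s.username, u.tablespace, u.segtype, u.blocks*8/1024 mb
from v$tempseg_usage u, v$session s
where u.session_addr = s.saddr
order by u.blocks desc;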
##Datafile fragmentation
select /*+ ordered use_hash(a,b,c) */ a.file_id,a.file_name,a.filesize, b.freesize,
(a.filesize-b.freesize) usedsize, c.hwmsize, c.hwmsize - (a.filesize-b.freesize) freesize_belowhwm, a.filesize - c.hwmsize canshrinksize
from (select file_id,file_name,round(bytes/1024/1024) filesize from dba_data_files) a,
(select file_id,round(sum(dfs.bytes)/1024/1024) freesize from dba_free_space dfs group by file_id) b,
(select file_id,round(max(block_id)*8/1024) HWMsize from dba_extents group by file_id) c
where a.file_id = b.file_id and a.file_id = c.file_id
order by freesize_belowhwm desc;
#Tablespace fragmentation (FSFI)
select a.tablespace_name,
trunc(sqrt(max(blocks)/sum(blocks))* (100/sqrt(sqrt(count(blocks)))),2) fsfi
from dba_free_space a,dba_tablespaces b
where a.tablespace_name=b.tablespace_name
and b.contents not in('TEMPORARY','UNDO')
group by A.tablespace_name
order by fsfi;