230 while (1) { 231 TS_RECORD(TS_H1,stamp_offset + 2000); 232 if (m_request==0) { 233 DEBUG_TEXT(DFDB_QUEUE, 8, "RequestHandler(" << m_handlerId <<")::run: popping request Q"); 234 m_request=m_requestQueue->pop(); 235 } 236 else { 237 DEBUG_TEXT(DFDB_QUEUE, 8, "RequestHandler(" << m_handlerId <<")::run: repeating oldRequest"); 238 } 239 DEBUG_TEXT(DFDB_QUEUE, 8, "RequestHandler(" << m_handlerId <<")::run: request Q popped OK"); 240 241 int requestOk=-1; // 将requestOk赋值拿到try块外,通过在程序块内打印requestOk的值,推断程序块的执行情况。 242 try { 243 TS_RECORD(TS_H1,stamp_offset + 2100); 244 //int requestOk; 245 //unsigned int tsdata = m_handlerId * 100 + 3000000000; 246 //TS_RECORD(TS_H5,tsdata); 247 248 requestOk = m_request->execute(); 249 TS_RECORD(TS_H1,stamp_offset + 2200); 250 251 if (requestOk == Request::REQUEST_OK) { 252 m_requestsHandled++; 253 deleteRequest(); 254 DEBUG_TEXT(DFDB_ROSCORE, 8, "RequestHandler(" << m_handlerId <<")::run requestOk"); 255 TS_RECORD(TS_H1, stamp_offset + 2300); 256 } 257 else if (requestOk == Request::REQUEST_TIMEOUT) { 258 m_requestsTimedOut++; 259 deleteRequest(); 260 TS_RECORD(TS_H1, stamp_offset + 2300); 261 } 262 else { 263 TS_RECORD(TS_H1, stamp_offset + 2400); 264 DEBUG_TEXT(DFDB_ROSCORE, 8, "RequestHandler(" << m_handlerId <<")::run: Request non completed: will be requeued."); 265 m_requestsFailed++; 266 m_request = m_requestQueue->swap(m_request); 267 DFThread::yieldOrCancel(); 268 TS_RECORD(TS_H1, stamp_offset + 2500); 269 } 270 271 TS_RECORD(TS_H1, stamp_offset + 2990); 272 } 273 catch (std::exception& issue) { 274 ENCAPSULATE_ROS_EXCEPTION(newIssue, CoreException, UNCLASSIFIED, issue, "RequestHandler caught exception, deleting Request "); 275 DEBUG_TEXT(DFDB_ROSCORE, 8, "requestOk: " << requestOk<< std::endl); 276 ers::error(newIssue); 277 deleteRequest(); 278 } 279 }
查看log文件ROS-Eth-02_cmm03node01_1487247826.out:
3006978 Debug(12,140507831203584): RequestHandler(1)::run: popping request Q 3006979 Debug(7,140507829102336): DcTriggerIn::processDataRequest() EB message (xid 131047) 3006980 Debug(7,140507829102336): new FragmentRequest level1Id:131047 3006981 3006982 Debug(12,140507831203584): RequestHandler(1)::run: request Q popped OK 3006983 Debug(13,140507831203584): FragmentRequest::execute for L1Id 131047 3006984 3006985 Debug(13,140507831203584): FragmentRequest::execute calling builder->createFragment for L1Id 131047 3006986 3006987 Debug(13,140507831203584): FragmentRequest::execute calling main sendData for L1Id 131047 3006988 3006989 Debug(7,140507829102336): DcTriggerIn::processDataRequest() EB message (xid 131076) 3006990 Debug(7,140507829102336): new FragmentRequest level1Id:131076 3006991 3006992 Debug(7,140507829102336): DcTriggerIn::processDataRequest() EB message (xid 131052) 3006993 Debug(7,140507829102336): new FragmentRequest level1Id:131052 3006994 3006995 Debug(13,140507831203584): FragmentRequest::execute finished with L1Id 131047fragmentOk=1 3006996 3006997 Debug(13,140507831203584): FragmentRequest::~FragmentRequest 3006998 3006999 Debug(13,140507831203584): RequestHandler(1)::run requestOk 3007000 Debug(12,140507831203584): RequestHandler(1)::run: popping request Q 3007001 Debug(12,140507831203584): RequestHandler(1)::run: request Q popped OK 3007002 Debug(13,140507831203584): ReleaseRequest::execute started 3007003 3007004 Debug(13,140507831203584): ReleaseRequest::~ReleaseRequest 3007005 3007006 Debug(13,140507831203584): RequestHandler(1)::run requestOk 3007007 Debug(12,140507831203584): RequestHandler(1)::run: popping request Q 3007008 Debug(12,140507831203584): RequestHandler(1)::run: request Q popped OK 3007009 Debug(13,140507831203584): FragmentRequest::execute for L1Id 131076 3007010 3007011 Debug(13,140507831203584): FragmentRequest::execute calling builder->createFragment for L1Id 131076 3007012 3007013 Debug(13,140507831203584): requestOk: -1 //说明try..catch块抓到异常requestOk的值为-1,说明Request的execute执行没有成功完成 3007014 3007015 Debug(13,140507831203584): FragmentRequest::~FragmentRequest 3007016 3007017 Debug(12,140507831203584): RequestHandler(1)::run: popping request Q 3007018 Debug(12,140507831203584): RequestHandler(1)::run: request Q popped OK 3007019 Debug(13,140507831203584): FragmentRequest::execute for L1Id 131052 3007020 3007021 Debug(7,140507829102336): DcTriggerIn::processDataRequest() EB message (xid 131049) 3007022 Debug(7,140507829102336): new FragmentRequest level1Id:131049 3007023 3007024 Debug(7,140507829102336): DcTriggerIn::processDataRequest() EB message (xid 131077) 3007025 Debug(7,140507829102336): new FragmentRequest level1Id:131077
由于requestOk的值为-1,得出,Request的execute函数没有正确返回,因为正确返回的结果为0,1或者2。
在log文件里查找L1id为131076的FragmentRequest在execute函数执行的最后一句话有没有打印:
[lhaaso@cmm03node01 part_dk_ef]$ grep "FragmentRequest::execute finished with L1Id 131076fragmentOk=1" ROS-Eth-02_cmm03node01_1487247826.out [lhaaso@cmm03node01 part_dk_ef]$
[lhaaso@cmm03node01 part_dk_ef]$ grep "FragmentRequest::execute calling main sendData for L1Id 131076" ROS-Eth-02_cmm03node01_1487247826.out
[lhaaso@cmm03node01 part_dk_ef]$
没有找到,说明是FragmentRequest的execute()没有正常执行,并且FragmentRequest::execute()函数在”FragmentRequest::execute calling main sendData for L1Id”打印出来之前就已经出现问题,后面没有接着执行发送131076的数据片段给SFI,这也导致了SFI报的warning:
[lhaaso@cmm03node13 part_dk_ef]$ grep -n "131076" SFI*.err SFI-4_cmm03node13_1487247819.err:159:WARNING 2017-Feb-16 20:31:36 [DC::StatusWord EventAssembly::EventCompleted(...) at SFI/src/EventAssembly.cxx:480] Problem with the flow of data: Event with LVL1ID 131076 misses 2 data fragment(s) from: ROS-Eth-01, ROS-Eth-02,
程序出错的范围为第117行~第170行。
107 int FragmentRequest::execute(void) 108 { 109 DEBUG_TEXT(DFDB_ROSCORE, 8, "FragmentRequest::execute for L1Id "<< m_level1Id<< std::endl); //已被打印 110 //This code is to record the last runs through this function with 111 //the ring buffer mode of the rcc_time_stamp package 112 //unsigned int tsdata = (m_level1Id * 100) + 2000000000; 113 //TS_RECORD(TS_H5,(tsdata+1)); 114 115 // Ask the builder to create us an empty fragment. 116 s_mutex->lock() ; 117 DEBUG_TEXT(DFDB_ROSCORE, 8, "FragmentRequest::execute calling builder->createFragment for L1Id "<< m_level1Id<< std::endl);//已被打印 118 m_eventFragment=m_builder->createFragment(m_level1Id, m_dataChannels->size()); 119 s_mutex->unlock() ; 120 121 122 std::vector<DataChannel*>::iterator chanStartIter=m_dataChannels->begin(); 123 std::vector<DataChannel*>::iterator chanEndIter=m_dataChannels->end(); 124 // Pre-request data from all DataChannels involved 125 int index=0; 126 for (std::vector<DataChannel*>::iterator dc=chanStartIter; dc!=chanEndIter; dc++) { 127 m_ticket[index++] = (*dc)->requestFragment(m_level1Id); 128 } 129 130 bool fragmentOk=true; 131 index=0; 132 int partsReceived=0; 133 for (std::vector<DataChannel*>::iterator dc=chanStartIter; dc!=chanEndIter; dc++) { 134 DEBUG_TEXT(DFDB_ROSCORE, 20, "FragmentRequest::execute calling dc->getFragment for L1Id "<< m_level1Id<< std::endl); 135 EventFragment* subFragment = ((*dc)->getFragment(m_ticket[index])) ; 136 if (subFragment != 0) { 137 partsReceived++; 138 DEBUG_TEXT(DFDB_ROSCORE, 20, "FragmentRequest::execute calling builder->appendFragment for L1Id "<< m_level1Id<< std::endl ); 139 m_builder->appendFragment(m_eventFragment,subFragment); 140 TS_RECORD(TS_H1,2350); 141 142 fragmentOk=subFragment->fragmentReady(); 143 144 s_mutex->lock() ; 145 delete subFragment; 146 s_mutex->unlock() ; 147 } 148 else { 149 DEBUG_TEXT(DFDB_ROSCORE, 15, "FragmentRequest for L1Id "<< m_level1Id << " missing data aborting "); 150 fragmentOk=false; 151 TS_RECORD(TS_H1,2360); 152 } 153 154 index++; 155 } 156 157 158 bool retired=false; 159 if (!fragmentOk) { 160 retired=checkAge(s_maxAge); 161 } 162 163 if (fragmentOk || (retired && partsReceived>0)) { 164 165 if (retired) { 166 // Issue warning message 167 CREATE_ROS_EXCEPTION(tIssue,CoreException,CoreException::TIMEOUT, 168 "in request for fragment with L1 ID "<<m_level1Id); 169 ers::warning(tIssue); 170 } 171 DEBUG_TEXT(DFDB_ROSCORE, 8, "FragmentRequest::execute calling main sendData for L1Id "<< m_level1Id<< std::endl);//未被打印