【NE现场】
pid: 3560, tid: 6404, name: MediaScannerSer >>> android.process.media <<< signal 7 (SIGBUS), code 1 (BUS_ADRALN), fault addr 0x9f5d9dd6 r0 9f5d9dd6 r1 0000000c r2 0000829a r3 0000000b r4 9f5d9dd6 r5 0000829a r6 ae94fec8 r7 9f5d9c08 r8 00000008 r9 0000136c sl ae950268 fp 0000000c ip ae94ff30 sp a00178e8 lr ae946ad9 pc ae9463b6 cpsr 60010030 backtrace: #00 pc 000023b6 /system/lib/libjhead.so (ConvertAnyFormat+133) #01 pc 00002ad5 /system/lib/libjhead.so #02 pc 00002dbd /system/lib/libjhead.so #03 pc 00003163 /system/lib/libjhead.so (process_EXIF+258) #04 pc 00004eef /system/lib/libjhead.so (ReadJpegSections+638) #05 pc 000054a1 /system/lib/libjhead.so (ReadJpegFile+68) #06 pc 00001069 /system/lib/libjhead_jni.so #07 pc 000012a5 /system/lib/libjhead_jni.so #08 pc 726ea305 /data/dalvik-cache/arm/system@framework@boot.oat (offset 0x23a6000)
错误的类型是地址访问未对齐导致的。
【问题分析】
用addr2line反解出的调用栈如下:
#00 ConvertAnyFormat@external/jhead/exif.c:456 #01 ProcessExifDir@external/jhead/exif.c:849 #02 ProcessExifDir@external/jhead/exif.c:975 #03 process_EXIF@external/jhead/exif.c:1133 #04 ReadJpegSections@external/jhead/jpgfile.c:259 #05 ReadJpegFile@external/jhead/jpgfile.c:539 #06 loadExifInfo@external/jhead/main.c:105 #07 getAttributes@external/jhead/main.c:569 #08 pc 726ea305 /data/dalvik-cache/arm/system@framework@boot.oat (offset 0x23a6000)
看起来是java层调用jni函数getAttributes的时候挂掉的。
NE的点:
double ConvertAnyFormat(void * ValuePtr, int Format) { double Value; Value = 0; switch(Format){ case FMT_SBYTE: Value = *(signed char *)ValuePtr; break; case FMT_BYTE: Value = *(uchar *)ValuePtr; break; case FMT_USHORT: Value = Get16u(ValuePtr); break; case FMT_ULONG: Value = Get32u(ValuePtr); break; case FMT_URATIONAL: case FMT_SRATIONAL: { int Num,Den; Num = Get32s(ValuePtr); Den = Get32s(4+(char *)ValuePtr); if (Den == 0){ Value = 0; }else{ Value = (double)Num/Den; } break; } case FMT_SSHORT: Value = (signed short)Get16u(ValuePtr); break; case FMT_SLONG: Value = Get32s(ValuePtr); break; // Not sure if this is correct (never seen float used in Exif format) case FMT_SINGLE: Value = (double)*(float *)ValuePtr; break; case FMT_DOUBLE: Value = *(double *)ValuePtr; break; // <<<<<<<<<这条语句NE default: ErrNonfatal("Illegal format code %d",Format,0); } return Value; }
这里的注释有意思,说这种格式正常来说不应该走的。只是因为规范里写了,所以作者加了对应的case,说明这段code可能不靠谱。
接着看它的上一级函数:
static void ProcessExifDir(unsigned char * DirStart, unsigned char * OffsetBase, unsigned ExifLength, int NestingLevel) { ... for (de=0;de<NumDirEntries;de++){ int Tag, Format, Components; unsigned char * ValuePtr; int ByteCount; unsigned char * DirEntry; DirEntry = DIR_ENTRY_ADDR(DirStart, de); Tag = Get16u(DirEntry); Format = Get16u(DirEntry+2); Components = Get32u(DirEntry+4); ... ByteCount = Components * BytesPerFormat[Format]; if (ByteCount > 4){ unsigned OffsetVal; OffsetVal = Get32u(DirEntry+8); ValuePtr = OffsetBase+OffsetVal; ... } ... // Extract useful components of tag switch(Tag){ ... case TAG_EXPOSURETIME: // Simplest way of expressing exposure time, so I trust it most. // (overwrite previously computd value if there is one) ImageInfo.ExposureTime = (float)ConvertAnyFormat(ValuePtr, Format); <<<<<<<<<< break; ...
ExposureTime的意思是曝光长度(即曝光时间),这个是相机里的专业术语,那这张图片可能是照相机拍出来的。
而tagtable上写着TAG_EXPOSURETIME对应的数据格式是FMT_SRATIONAL,也就是有理数。
static const TagTable_t TagTable[] = { ... { TAG_CFA_PATTERN1, "CFAPattern", 0, 0}, { TAG_BATTERY_LEVEL, "BatteryLevel", 0, 0}, { TAG_COPYRIGHT, "Copyright", FMT_STRING, -1}, { TAG_EXPOSURETIME, "ExposureTime", FMT_SRATIONAL, 1}, { TAG_FNUMBER, "FNumber", FMT_SRATIONAL, 1}, { TAG_IPTC_NAA, "IPTC/NAA", 0, 0}, ...
有理数是什么意思呢? 了解相机术语的同学应该知道,曝光长度的单位是秒,一般是1/48秒、1/2000秒等。
这种就是有理数了,上面ConvertAnyFormat()中对FMT_SRATIONAL的处理也能看出来。
这种数据应该是8字节的也就是2个32位数,前一个32位数表示分子,后一个32位数表示分母。
从规范中也能找到这部分的定义http://www.media.mit.edu/pia/Research/deepview/exif.html。
也就是说,代码是符合规范的,但实际的图片格式不符合规范。
那哪来这么多异常图片呢?为此,需要弄到具体的图片。
从tombstone的near sl数据中,看到如下内容:
memory near sl: ae950248 00000003 00000000 00000000 00000000 ................ ae950258 00000001 00000000 00000001 00000000 ................ ae950268 00000000 00000000 00000001 00000005 ................ ae950278 00000000 b4ee22a0 00000000 6f74732f ....."....../sto ae950288 65676172 756d652f 6574616c 2f302f64 rage/emulated/0/ ae950298 636e6574 2f746e65 69665151 725f656c tencent/QQfile_r ae9502a8 2f766365 706a2e34 00000067 00000000 ecv/4.jpg....... ae9502b8 00000000 00000000 00000000 00000000 ................ ae9502c8 00000000 00000000 00000000 00000000 ................ ae9502d8 00000000 00000000 00000000 00000000 ................ ae9502e8 00000000 00000000 00000000 00000000 ................ ae9502f8 00000000 00000000 00000000 00000000 ................ ae950308 00000000 00000000 00000000 00000000 ................ ae950318 00000000 00000000 00000000 00000000 ................ ae950328 00000000 00000000 00000000 00000000 ................ ae950338 00000000 00000000 00000000 00000000 ................
里面有个路径是jpg图片,该路径为/storage/emulated/0/tencent/QQfile_recv/4.jpg,地址为0xae950284。
从map表可以看到这块地址是libjhead.so的数据段。
ae944000-ae94efff r-x 0 b000 /system/lib/libjhead.so (BuildId: e8fd847cc1d1d9f8d8df80fbcbd899b1) ae94f000-ae94ffff r-- a000 1000 /system/lib/libjhead.so ae950000-ae950fff rw- b000 1000 /system/lib/libjhead.so
那这个数据应该是静态变量数据了。libjhead.so的加载基地址为0xae944000,因此该地址的模块内偏移为:
0xae950284 - 0xae944000 = 0xc284
用readelf看看这个是哪个变量:
$ readelf -s libjhead.so |grep c284 95: 0000c284 6776 OBJECT GLOBAL DEFAULT 22 ImageInfo
可知,这个变量就是ImageInfo,代码中定义如下:
typedef struct { char FileName [PATH_MAX+1]; time_t FileDateTime; unsigned FileSize; char CameraMake [32]; ... char GpsDateStamp[11]; char GpsTimeStamp[11]; char GpsProcessingMethod[GPS_PROCESSING_METHOD_LEN + 1]; }ImageInfo_t; ImageInfo_t ImageInfo;
ImageInfo的第一个成员刚好是FileName。那这个FileName是什么时候赋值的呢?
回溯调用栈,在#6层发现了赋值的代码。
static int loadExifInfo(const char* FileName, int readJPG) { ... strncpy(ImageInfo.FileName, FileName, PATH_MAX); <<<<<<<<<<<<<< #ifdef SUPERDEBUG ALOGE("ReadJpegFile"); #endif return ReadJpegFile(FileName, ReadMode); }
因此可以确定/storage/emulated/0/tencent/QQfile_recv/4.jpg就是格式错误的图片。
而其他10几个tombstone中这个file name都指向/storage/emulated/0/tencent/QQfile_recv/,只是文件名不同。
QQ收发文件时对文件进行了改写?
带着疑问跟测试同事沟通,测试同事反馈知乎上最近一段时间内有QQ引起的Crash问题。
链接如下:
https://www.zhihu.com/question/43322214/answer/95853794,文中描述的正是我们遇到的这个问题。
搞来出问题的图片,push到我们多个机型后问题必现。
用二进制编辑器发现该jpg格式确实有问题:
FF D8 FF E0 00 10 4A 46 49 46 00 01 01 00 00 01 00 01 00 00 FF E1 00 B1 45 78 69 66 00 00 4D 4D 00 2A 00 00 00 08 00 05 01 10 00 02 00 00 00 08 00 00 00 4A 87 69 00 09 00 00 00 01 00 00 00 5B 01 01 00 09 00 00 00 01 00 00 10 40 01 00 00 09 00 00 00 01 00 00 0C 30 01 0F 00 02 00 00 00 09 00 00 00 52 00 00 00 00 4E 65 78 75 73 20 36 00 6D 6F 74 6F 72 6F 6C 61 00 00 04 88 27 00 09 00 00 00 01 00 00 00 CB 82 9A 00 0C 00 00 00 01 00 -- -- -- -- -- -- -- -- -- 00 00 91 92 0A 00 0A 00 00 00 01 00 00 00 99 82 -- -- -- 9D 00 0C 00 00 00 01 00 00 00 A1 00 00 00 00 3F 99 99 25 3B B4 F3 23 00 00 0E EC 00 00 03 E8 40 ...
数据中0x829a为TAG_EXPOSURETIME:
#define TAG_COPYRIGHT 0x8298 #define TAG_EXPOSURETIME 0x829A #define TAG_FNUMBER 0x829D
0x000c(12)是FMT_DOUBLE
#define FMT_SRATIONAL 10 #define FMT_SINGLE 11 #define FMT_DOUBLE 12
解析代码为:
static void ProcessExifDir(unsigned char * DirStart, unsigned char * OffsetBase, unsigned ExifLength, int NestingLevel) { ... for (de=0;de<NumDirEntries;de++){ int Tag, Format, Components; unsigned char * ValuePtr; int ByteCount; unsigned char * DirEntry; DirEntry = DIR_ENTRY_ADDR(DirStart, de); Tag = Get16u(DirEntry); Format = Get16u(DirEntry+2); Components = Get32u(DirEntry+4); ... ByteCount = Components * BytesPerFormat[Format]; if (ByteCount > 4){ unsigned OffsetVal; OffsetVal = Get32u(DirEntry+8); ValuePtr = OffsetBase+OffsetVal; ... } ... // Extract useful components of tag switch(Tag){ ... case TAG_EXPOSURETIME: // Simplest way of expressing exposure time, so I trust it most. // (overwrite previously computd value if there is one) ImageInfo.ExposureTime = (float)ConvertAnyFormat(ValuePtr, Format); <<<<<<<<<< break; ...
先读tag和format,再读Components,最后读offset
对照数据,Components是0x00000001,offset是0x00000091。
而出问题的ValuePtr = base + offset,因为base是偶数、offset(0x91)是奇数,所以ValuePtr就必定是奇数了。
所以才会导致SIGBUS类型的Native Crash。
如果手动把format从0x0c改成0xa,就不会出问题了。
【解决方案】
剩下的问题是如何规避这个问题,知乎上给出的解决方案是:
- case FMT_DOUBLE: Value = *(double *)ValuePtr; break; + case FMT_DOUBLE: memcpy(&Value, ValuePtr, sizeof(double));break;
这种修改方法可以不让应用crash,但数据不会被正确解析,因为手机端数据是小端格式,而图片数据格式可以是小端,也可以是大端。
这种memcpy形式只能考虑到一种情况,而我们实际出问题的那张图片正是大端的,所以转成double后数据肯定不对。
其实源代码中已经提供了解决方案,如:
int Get16u(void * Short) { if (MotorolaOrder){ return (((uchar *)Short)[0] << 8) | ((uchar *)Short)[1]; }else{ return (((uchar *)Short)[1] << 8) | ((uchar *)Short)[0]; } }
参考这种方法,做了如下修改:
//-------------------------------------------------------------------------- // Convert a 64 bit signed value from file's native byte order //-------------------------------------------------------------------------- long long Get64s(void * LLong) { uchar * ValuePtr = (uchar *)LLong; if (MotorolaOrder){ return (((long long)ValuePtr[0]) << 56) | (((ullong)ValuePtr[1]) << 48) | ((( ullong)ValuePtr[2]) << 40) | (((ullong)ValuePtr[3]) << 32) | ((( ullong)ValuePtr[4]) << 24) | (((ullong)ValuePtr[5]) << 16) | ((( ullong)ValuePtr[6]) << 8 ) | (((ullong)ValuePtr[7]) << 0 ); }else{ return (((long long)ValuePtr[7]) << 56) | (((ullong)ValuePtr[6]) << 48) | ((( ullong)ValuePtr[5]) << 40) | (((ullong)ValuePtr[4]) << 32) | ((( ullong)ValuePtr[3]) << 24) | (((ullong)ValuePtr[2]) << 16) | ((( ullong)ValuePtr[1]) << 8 ) | (((ullong)ValuePtr[0]) << 0 ); } } //-------------------------------------------------------------------------- // Convert a 64 bit double value from signed long long value //-------------------------------------------------------------------------- double Get64d(void * Double) { union { double ret; long long var; }data; data.var = Get64s(Double); return data.ret; } //-------------------------------------------------------------------------- // Convert a 32 bit float value from signed int value //-------------------------------------------------------------------------- float Get32f(void * Float) { union { float ret; int var; }data; data.var = Get32s(Float); return data.ret; }
...
//--------------------------------------------------------------------------
// Print up to 16 consecutive values of the given Exif format to stdout.
//
// ValuePtr   start of the value data
// Format     one of the FMT_* codes
// ByteCount  number of bytes of value data available at ValuePtr
//
// Values are separated by ", "; "..." is appended when the 16 value limit
// is reached. An unknown format prints a message and returns immediately.
//--------------------------------------------------------------------------
void PrintFormatNumber(void * ValuePtr, int Format, int ByteCount)
{
    int s = 0;  // size in bytes of the element just printed
    int n;

    for(n=0;n<16;n++){
        switch(Format){
            case FMT_SBYTE:
            case FMT_BYTE:      printf("%02x",*(uchar *)ValuePtr); s=1;  break;
            case FMT_USHORT:    printf("%d",Get16u(ValuePtr)); s=2;      break;
            // Unsigned 32 bit values are printed as unsigned so that values
            // of 0x80000000 and above do not show up as negative numbers.
            case FMT_ULONG:     printf("%u",(unsigned)Get32s(ValuePtr)); s=4; break;
            case FMT_SLONG:     printf("%d",Get32s(ValuePtr)); s=4;      break;
            case FMT_SSHORT:    printf("%hd",(signed short)Get16u(ValuePtr)); s=2; break;
            case FMT_URATIONAL:
                printf("%u/%u",(unsigned)Get32s(ValuePtr), (unsigned)Get32s(4+(char *)ValuePtr));
                s = 8;
                break;
            case FMT_SRATIONAL:
                printf("%d/%d",Get32s(ValuePtr), Get32s(4+(char *)ValuePtr));
                s = 8;
                break;
            // <<<<<< changed: read through Get32f/Get64d instead of casting
            // ValuePtr to float*/double* and dereferencing it.
            case FMT_SINGLE:    printf("%f",Get32f(ValuePtr)); s=4;      break;
            case FMT_DOUBLE:    printf("%f",Get64d(ValuePtr)); s=8;      break;
            default:
                printf("Unknown format %d:", Format);
                return;
        }
        ByteCount -= s;
        if (ByteCount <= 0) break;
        printf(", ");
        ValuePtr = (void *)((char *)ValuePtr + s);
    }
    // Only reached with n == 16 when the loop ran out of iterations
    // while data was still remaining.
    if (n >= 16) printf("...");
}
//--------------------------------------------------------------------------
// Evaluate a single Exif value of the given format as a double.
//
// ValuePtr  start of the value data
// Format    one of the FMT_* codes
//
// Returns the value converted to double. Rationals with a zero denominator
// yield 0. An unknown format is reported through ErrNonfatal and Value is
// left at 0.
//--------------------------------------------------------------------------
double ConvertAnyFormat(void * ValuePtr, int Format)
{
    double Value = 0;

    switch(Format){
        case FMT_SBYTE:     Value = *(signed char *)ValuePtr;  break;
        case FMT_BYTE:      Value = *(uchar *)ValuePtr;        break;

        case FMT_USHORT:    Value = Get16u(ValuePtr);          break;
        case FMT_ULONG:     Value = Get32u(ValuePtr);          break;

        case FMT_URATIONAL:
        case FMT_SRATIONAL:
            {
                int Num = Get32s(ValuePtr);
                int Den = Get32s(4+(char *)ValuePtr);
                if (Den == 0){
                    Value = 0;
                }else if (Format == FMT_SRATIONAL){
                    Value = (double)Num/Den;
                }else{
                    // Unsigned rational: reinterpret both halves as unsigned
                    // so that values >= 0x80000000 are not taken as negative.
                    Value = (double)(unsigned)Num/(double)(unsigned)Den;
                }
                break;
            }

        case FMT_SSHORT:    Value = (signed short)Get16u(ValuePtr);  break;
        case FMT_SLONG:     Value = Get32s(ValuePtr);                break;

        // <<<<<< changed: read through Get32f/Get64d instead of casting
        // ValuePtr to float*/double* and dereferencing it; the direct
        // dereference raised SIGBUS (BUS_ADRALN) on an odd address.
        case FMT_SINGLE:    Value = Get32f(ValuePtr);      break;
        case FMT_DOUBLE:    Value = Get64d(ValuePtr);      break;

        default:
            ErrNonfatal("Illegal format code %d",Format,0);
    }
    return Value;
}
【后记】
而网上反馈三星和华为的手机没这个问题,为此专门dump了三星s6的libjhead.so,发现这个库和我们自己的一模一样。
那为什么三星的不会crash呢?原来s6手机是64位的,它的内置应用也是64位的,而我们目前发现出问题的都是32位的机器。
s6上安装es文件浏览器,打开这个文件的时候也是crash,因为es文件浏览器是32位的。
用我们的64位的手机实验,其表现和三星一模一样。
那为什么同一套代码32位和64位表现不一样呢?
32位crash的指令如下:
23b6: ed90 2b00 vldr d2, [r0]
而对应64位的指令如下:
3b1c: bd400260 ldr d0, [x19]
通过log确定64位下这个地址也是奇数,难道64位不对地址做对齐的要求?
于是查找Arm规范,发现如下说明:
@ARMv8_ISA_Overview_PRD03-GENC-010197-15-0.pdf 3 A64 OVERVIEW ... • Unaligned addresses are permitted for most loads and stores, including paired register accesses, floating point and SIMD registers, with the exception of exclusive and ordered accesses
A64架构中对地址对齐没有严格要求。
还有一个现象是Android N 32位机器上也不会复现问题,发现N上不再用libjhead.so了。
到此,所有的疑惑都已经解开。
double和float内存布局相关文章:http://bbs.csdn.net/topics/370168606