zoukankan      html  css  js  c++  java
  • Python 源码剖析(三)【字符串对象】

    三、字符串对象

    1、PyStringObject 与 PyString_Type

    2、创建PyStringObject对象

    3、Intern 机制

    4、字符缓冲池

    5、PyStringObject 效率相关问题

    6、Hack PyStringObject


    1、PyStringObject 与 PyString_Type

    PyStringObject 对象的声明:

    [stringobject.h] 
    
    /*
     * Layout of a string object: the variable-size object header followed
     * by the cached hash, the interning state and the character data.
     */
    typedef struct {
    
        PyObject_VAR_HEAD
    
        /* Cached hash value; string_hash() treats -1 as "not computed yet". */
        long ob_shash;
    
        /* Interning state, e.g. SSTATE_NOT_INTERNED or SSTATE_INTERNED_MORTAL. */
        int ob_sstate;
    
        /* First byte of the character data.  The constructors allocate
           sizeof(PyStringObject) + size bytes, so the data and its
           terminating NUL are stored inline starting here. */
        char ob_sval[1];
    
    } PyStringObject;
    PyObject_VAR_HEAD 中含有字符串长度ob_size;ob_sval是字符串存储内存的起始位置,大小为ob_size+1,且ob_sval[ob_size]='\0';

    ob_shash保存字符串哈希值,尚未计算过时为初始值-1(创建对象时设置),计算方法为:
    [stringobject.c]
    
    /*
     * Return the hash of string object `a`.
     *
     * The result is cached in a->ob_shash.  The value -1 is reserved to mean
     * "not computed yet", so a computed hash of -1 is stored and returned
     * as -2 instead.
     */
    static long string_hash(PyStringObject *a)
    {
        unsigned char *cursor;
        long hash;
        int remaining;

        /* Fast path: the hash has already been computed and cached. */
        if (a->ob_shash != -1) {
            return a->ob_shash;
        }

        /* Seed with the first byte, then mix in every byte of the data. */
        cursor = (unsigned char *) a->ob_sval;
        hash = *cursor << 7;
        for (remaining = a->ob_size; remaining > 0; remaining--) {
            hash = (1000003 * hash) ^ *cursor;
            cursor++;
        }

        /* Fold the length in as the final step. */
        hash ^= a->ob_size;

        /* Keep -1 free as the "not computed" marker. */
        if (hash == -1) {
            hash = -2;
        }

        a->ob_shash = hash;
        return hash;
    }

    ob_sstate表示该对象是否被Intern。

    PyStringObject对应的类型对象:

    [stringobject.c] 
    
    /*
     * Type object for PyStringObject.  Lines consisting of "……" mark
     * entries that were left out of this excerpt.
     */
    PyTypeObject PyString_Type = {
    
        PyObject_HEAD_INIT(&PyType_Type)
    
        0,
    
        "str",                          /* type name (presumably tp_name -- confirm) */
    
        sizeof(PyStringObject),         /* fixed part of an instance (presumably tp_basicsize -- confirm) */
    
        sizeof(char),                   /* size of one element (presumably tp_itemsize -- confirm) */
    
        ……
    
        (reprfunc)string_repr,          /* tp_repr */
    
        &string_as_number,          /* tp_as_number */
    
        &string_as_sequence,            /* tp_as_sequence */
    
        &string_as_mapping,         /* tp_as_mapping */
    
        (hashfunc)string_hash,          /* tp_hash */
    
        0,                  /* tp_call */
    
        ……
    
        string_new,             /* tp_new */
    
        PyObject_Del,                       /* tp_free */
    
    };

    2、创建PyStringObject对象

    最一般的方法PyString_FromString:

    [stringobject.c] 
    
    /*
     * Create a string object from the NUL-terminated C string `str`.
     *
     * The empty string and one-character strings are served from the
     * `nullstring` / `characters[]` caches once those have been populated;
     * otherwise a new object is allocated and the bytes are copied in.
     * Returns NULL after setting OverflowError when the length exceeds
     * INT_MAX, or the result of PyErr_NoMemory() when allocation fails.
     */
    PyObject *
    
    PyString_FromString(const char *str)
    
    {
    
        register size_t size;
    
        register PyStringObject *op;
    
     
    
    assert(str != NULL);
    
    /* Measure the string and reject lengths above INT_MAX */
    
        size = strlen(str);
    
        if (size > INT_MAX) {
    
            PyErr_SetString(PyExc_OverflowError,
    
                "string is too long for a Python string");
    
            return NULL;
    
    }
    
     
    
    /* Empty string: reuse the shared nullstring object if it already exists */
    
        if (size == 0 && (op = nullstring) != NULL) {
    
    #ifdef COUNT_ALLOCS
    
            null_strings++;
    
    #endif
    
            Py_INCREF(op);
    
            return (PyObject *)op;
    
        }
    
        /* One-character string: reuse the cached object for this byte value
           if it already exists */
    
        if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
    
    #ifdef COUNT_ALLOCS
    
            one_strings++;
    
    #endif
    
            Py_INCREF(op);
    
            return (PyObject *)op;
    
        }
    
     
    
        /* Create a new PyStringObject and initialise it */
    
        /* Inline PyObject_NewVar */
    
        op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
    
        if (op == NULL)
    
            return PyErr_NoMemory();
    
        PyObject_INIT_VAR(op, &PyString_Type, size);
    
        /* -1 marks the hash as not yet computed */
    
        op->ob_shash = -1;
    
        op->ob_sstate = SSTATE_NOT_INTERNED;
    
    /* size+1 bytes are copied so the terminating NUL of str comes along */
    
    memcpy(op->ob_sval, str, size+1);
    
     
    
        /* Intern (share) very short PyStringObject objects */
    
        if (size == 0) {
    
            PyObject *t = (PyObject *)op;
    
            PyString_InternInPlace(&t);
    
            /* InternInPlace may have replaced t with an already-interned object */
    
            op = (PyStringObject *)t;
    
            nullstring = op;
    
            /* extra reference, presumably for the pointer kept in nullstring */
    
            Py_INCREF(op);
    
        } else if (size == 1) {
    
            PyObject *t = (PyObject *)op;
    
            PyString_InternInPlace(&t);
    
            op = (PyStringObject *)t;
    
            characters[*str & UCHAR_MAX] = op;
    
            /* extra reference, presumably for the pointer kept in characters[] */
    
            Py_INCREF(op);
    
        }
    
        return (PyObject *) op;
    
    }

    首先判断有没有超过长度限制(INT_MAX,即 2147483647),然后判断是不是空串,如果是并且之前创建过空串对象(nullstring)就直接返回;接着判断其是否为单个字符,如果是并且字符缓冲池 characters 中已有对应对象就直接返回;否则就申请内存(sizeof(PyStringObject) + size 字节),创建并初始化对象,对空串和单字符串再做 Intern 处理并放入缓存,最后返回。

    另一个创建PyStringObject对象的途径:

    [stringobject.c]
    
    /*
     * Create a string object holding `size` bytes.
     *
     * When `str` is non-NULL its first `size` bytes are copied into the new
     * object; when `str` is NULL the data bytes are left uninitialised for
     * the caller to fill in.  In both cases the byte following the data is
     * set to '\0'.  The empty string and, when `str` is given, one-byte
     * strings are shared through `nullstring` and `characters[]`.
     *
     * Returns the string object, or the result of PyErr_NoMemory() when the
     * allocation fails.
     */
    PyObject* PyString_FromStringAndSize(const char *str, int size)
    {
        register PyStringObject *op;

        /* Empty string: reuse the shared nullstring object if it exists. */
        if (size == 0 && (op = nullstring) != NULL) {
    #ifdef COUNT_ALLOCS
            null_strings++;
    #endif
            Py_INCREF(op);
            return (PyObject *)op;
        }

        /* One-byte string with known content: reuse the cached object. */
        if (size == 1 && str != NULL &&
            (op = characters[*str & UCHAR_MAX]) != NULL)
        {
    #ifdef COUNT_ALLOCS
            one_strings++;
    #endif
            Py_INCREF(op);
            return (PyObject *)op;
        }

        /* Inline PyObject_NewVar */
        op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
        if (op == NULL)
            return PyErr_NoMemory();
        PyObject_INIT_VAR(op, &PyString_Type, size);
        op->ob_shash = -1;                      /* hash not computed yet */
        op->ob_sstate = SSTATE_NOT_INTERNED;
        if (str != NULL)
            memcpy(op->ob_sval, str, size);
        /* Always terminate the buffer, even when str was NULL. */
        op->ob_sval[size] = '\0';

        /* share short strings */
        if (size == 0) {
            PyObject *t = (PyObject *)op;
            PyString_InternInPlace(&t);
            /* InternInPlace may have swapped in an already-interned object. */
            op = (PyStringObject *)t;
            nullstring = op;
            Py_INCREF(op);
        } else if (size == 1 && str != NULL) {
            PyObject *t = (PyObject *)op;
            PyString_InternInPlace(&t);
            op = (PyStringObject *)t;
            characters[*str & UCHAR_MAX] = op;
            Py_INCREF(op);
        }
        return (PyObject *) op;
    }

    和前一个差不多。


     

    3、Intern 机制

    现在看下前面提到的Intern机制,即创建PyStringObject对象函数里的PyString_InternInPlace:
    [stringobject.c]
    
    /*
     * Intern the string object that *p points to.
     *
     * If an equal string is already stored in the `interned` dictionary, *p
     * is redirected to that object and the reference to the original object
     * is released.  Otherwise the object itself is stored in `interned` (as
     * both key and value) and marked SSTATE_INTERNED_MORTAL.  String
     * subclasses and already-interned strings are left untouched.  Errors
     * while creating or updating the dictionary are cleared and the object
     * is simply left un-interned.
     */
    void PyString_InternInPlace(PyObject **p)
    
    {
    
        register PyStringObject *s = (PyStringObject *)(*p);
    
        PyObject *t;
    
        if (s == NULL || !PyString_Check(s))
    
            Py_FatalError("PyString_InternInPlace: strings only please!");
    
        /* If it's a string subclass, we don't really know what putting
    
           it in the interned dict might do. */
    
        if (!PyString_CheckExact(s))
    
            return;
    
        /* Nothing to do when the object is already interned. */
    
        if (PyString_CHECK_INTERNED(s))
    
            return;
    
     
    
        /* The interned dictionary is created on first use. */
    
        if (interned == NULL) {
    
            interned = PyDict_New();
    
            if (interned == NULL) {
    
                PyErr_Clear(); /* Don't leave an exception */
    
                return;
    
            }
    
        }
    
        /* Look for an entry equal to s that was interned earlier. */
    
        t = PyDict_GetItem(interned, (PyObject *)s);
    
        if (t) {
    
            /* Hand the existing entry to the caller and drop the reference
               to the object that was passed in. */
    
            Py_INCREF(t);
    
            Py_DECREF(*p);
    
            *p = t;
    
            return;
    
        }
    
     
    
        /* First occurrence: store s in the dictionary as both key and value. */
    
        if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
    
            PyErr_Clear();
    
            return;
    
        }
    
        /* The two references in interned are not counted by refcnt.
    
           The string deallocator will take care of this */
    
        s->ob_refcnt -= 2;
    
        PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
    
    }

    首先进行一系列检测,而核心在于 interned(PyDictObject对象):如果interned中已有内容相同的字符串对象,就增加interned中那个对象的引用计数,减少传入对象的引用计数(传入对象只是临时对象),并让*p指向interned中的对象;如果不在interned中,则将该对象加入interned,并将其引用计数减2(interned中作为键和值的两个引用不计入引用计数,否则对象永远无法被销毁)。对象引用计数为0时会被销毁:

    [stringobject.c] 
    
    /*
     * Deallocator for string objects.
     *
     * A mortal interned string is first removed from the `interned`
     * dictionary; the memory is then released through the type's tp_free.
     * An immortal interned string or an unknown interning state reaching
     * this function is reported through Py_FatalError().
     */
    static void string_dealloc(PyObject *op)
    
    {
    
        switch (PyString_CHECK_INTERNED(op)) {
    
            case SSTATE_NOT_INTERNED:
    
                break;
    
     
    
            case SSTATE_INTERNED_MORTAL:
    
                /* revive dead object temporarily for DelItem */
    
                /* NOTE(review): 3 presumably covers the dictionary's key and
                   value references (left uncounted by PyString_InternInPlace)
                   plus one to keep the object alive during the delete --
                   confirm. */
    
                op->ob_refcnt = 3;
    
                if (PyDict_DelItem(interned, op) != 0)
    
                    Py_FatalError(
    
                        "deletion of interned string failed");
    
                break;
    
     
    
            case SSTATE_INTERNED_IMMORTAL:
    
                /* no break here: Py_FatalError() presumably does not return */
    
                Py_FatalError("Immortal interned string died.");
    
     
    
            default:
    
                Py_FatalError("Inconsistent interned string state.");
    
        }
    
        op->ob_type->tp_free(op);
    
    }

    4、字符缓冲池

    PyStringObject 为一个字节的字符对象设计了一个类似PyIntObject小整数对象池一样的对象池characters:

    /* Cache of one-character string objects, indexed by byte value.  Slots
       start out NULL and are filled by the string constructors the first
       time a given one-character string is created. */
    static PyStringObject *characters[UCHAR_MAX + 1]; 

    #define UCHAR_MAX     0xff      /* maximum unsigned char value */

    由字符串创建的代码可看出characters缓冲池在字符串生成时才开始生成。

    虽然在创建PyStringObject时Intern机制只作用于 空串和字符(对应nullstring和characters),但Intern机制会作用到其他地方。


    5、PyStringObject 效率相关问题

    字符串连接问题,用 '+' 效率低下,连接N个PyStringObject对象将进行N-1次内存申请及搬运,推荐使用join(只分配一次内存)。

    '+' 调用string_concat:

    [stringobject.c]
    
    /*
     * Implementation of `a + bb` for strings: build a new string object
     * containing the bytes of `a` followed by the bytes of `bb`.
     *
     * Every call allocates a fresh object and copies both operands, which is
     * why chaining '+' over many strings is slower than a single join.
     * The line "……" marks code omitted from this excerpt.
     *
     * Returns the new string object, or the result of PyErr_NoMemory() when
     * the allocation fails.
     */
    static PyObject* string_concat(register PyStringObject *a, register PyObject *bb)
    {
        register unsigned int size;
        register PyStringObject *op;
    #define b ((PyStringObject *)bb)
        ……
        size = a->ob_size + b->ob_size;

        /* Inline PyObject_NewVar */
        op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
        if (op == NULL)
            return PyErr_NoMemory();
        PyObject_INIT_VAR(op, &PyString_Type, size);
        op->ob_shash = -1;                      /* hash not computed yet */
        op->ob_sstate = SSTATE_NOT_INTERNED;

        /* Copy the left operand, then the right operand directly after it. */
        memcpy(op->ob_sval, a->ob_sval, (int) a->ob_size);
        memcpy(op->ob_sval + a->ob_size, b->ob_sval, (int) b->ob_size);
        /* Terminate the combined data with a NUL byte. */
        op->ob_sval[size] = '\0';
        return (PyObject *) op;
    #undef b
    }

    而join则这样(先计算出list或tuple中所有PyStringObject的总大小,再一次性申请内存):

    [stringobject.c]
    
    /*
     * Implementation of sep.join(sequence): concatenate the string items of
     * a sequence, inserting `self` between consecutive items.
     *
     * The total length is computed in a first pass so that the result is
     * allocated once; a second pass copies the items and separators into it.
     * This is an abridged excerpt: the line starting with "。。。。。。" marks
     * omitted code (presumably where `seq` and `seqlen` are set up).
     */
    static PyObject* string_join(PyStringObject *self, PyObject *orig)
    
    {
    
        char *sep = PyString_AS_STRING(self);
    
        const int seplen = PyString_GET_SIZE(self);
    
        PyObject *res = NULL;
    
        char *p;
    
        int seqlen = 0;
    
        size_t sz = 0;
    
        int i;
    
        PyObject *seq, *item;
    
    。。。。。。// (elided) obtain the number of PyStringObject items in the list and store it in seqlen
    
     
    
        /* First pass: add up the item lengths plus one separator between
           each pair of neighbouring items. */
    
        for (i = 0; i < seqlen; i++) 
    
    {
    
            /* NOTE(review): old_sz is not used in this excerpt -- presumably
               consumed by omitted code; confirm. */
    
            const size_t old_sz = sz;
    
            item = PySequence_Fast_GET_ITEM(seq, i);
    
            sz += PyString_GET_SIZE(item);
    
            if (i != 0)
    
                sz += seplen;
    
        }
    
    /* Allocate the result buffer (str == NULL leaves the data unfilled) */
    
        /* NOTE(review): res is not checked for NULL in the visible code. */
    
        res = PyString_FromStringAndSize((char*)NULL, (int)sz);
    
        /* Second pass: copy every PyStringObject of the list into the result */
    
        p = PyString_AS_STRING(res);
    
    for (i = 0; i < seqlen; ++i) 
    
    {
    
            size_t n;
    
            /* Fetch one PyStringObject from the list */
    
            item = PySequence_Fast_GET_ITEM(seq, i);
    
            n = PyString_GET_SIZE(item);
    
            memcpy(p, PyString_AS_STRING(item), n);
    
            p += n;
    
            /* A separator follows every item except the last one. */
    
            if (i < seqlen - 1) 
    
            {
    
                memcpy(p, sep, seplen);
    
                p += seplen;
    
            }
    
        }
    
        Py_DECREF(seq);
    
        return res;
    
    }

    6、Hack PyStringObject

    可在string_length中添加打印地址代码,运行len()时可观察到相同的字符串地址一样,这就是Intern机制的作用;可在len()中添加代码:

    static void ShowCharater() 
    
    { 
    
       char a = 'a'; 
    
       PyStringObject** posA = characters+(unsigned short)a; 
    
       int i; 
    
       for(i = 0; i < 5; ++i) 
    
       { 
    
          PyStringObject* strObj = posA[i]; 
    
          printf("%s, %d
    ", strObj->ob_sval, strObj->ob_refcnt); 
    
       } 
    
    }

    观察是否使用缓冲池对象。

  • 相关阅读:
    PowerMock 遇到的问题——2
    PowerMock遇到的问题——3
    PowerMock使用遇到的问题——2
    PowerMock使用遇到的问题——1
    PowerMock使用遇到的一些问题
    PowerMock.expectNew(Class<T> type, Class<?>[] parameterTypes, Object... arguments)
    PowerMock与EasyMock的应用(转)
    EasyMock的原理及使用方法
    一个div相对于外层的div水平和垂直居中
    SAP 采购订单收货时报错:对于采购订单xxxx无收货可能
  • 原文地址:https://www.cnblogs.com/GO-NO-1/p/6511993.html
Copyright © 2011-2022 走看看