[python源码分析] python2.7中的str 和 unicode

在python3中的字符串str/Unicode一文中, 我们可以看到python3+中 str和Unicode是同一个东西；而在python2+中, 它们却不一样。本文以python2.7源码为基础进行简单介绍。

1. str#

1.1 PyStringObject对象#

在 stringobject.h 中的 35~49行可以看到PyStringObject定义

/* Layout of a Python 2.7 `str` object: the variable-size object header
 * followed by a cached hash, an intern-state flag and the character data. */
typedef struct {
    PyObject_VAR_HEAD
    long ob_shash;      /* cached hash of the string; -1 until computed */
    int ob_sstate;      /* non-zero iff the string is in the 'interned' dict */
    char ob_sval[1];    /* character data; actually holds ob_size+1 bytes */

    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements; the last one is '\0'.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     *     ob_sstate != 0 iff the string object is in stringobject.c's
     *       'interned' dictionary; in this case the two references
     *       from 'interned' to this object are *not counted* in ob_refcnt.
     */
} PyStringObject;

ob_shash 是该字符串的哈希值。由于 Python 的字典实现大量使用了哈希值，且字典的键多为 PyStringObject，预先计算哈希值并保存可以加速字典的运算。
ob_sstate 和字符串对象的 intern 机制有关

追根溯源, 在object.h 中可以看到PyObject_VAR_HEAD、PyObject_HEAD 、 _PyObject_HEAD_EXTRA 和 _object的定义

/* Header of every variable-size object: the PyObject_HEAD fields followed
 * by ob_size, the number of items in the variable part. */
#define PyObject_VAR_HEAD               \
    PyObject_HEAD                       \
    Py_ssize_t ob_size; /* Number of items in variable part */

/* PyObject_HEAD defines the initial segment of every PyObject:
 * the optional extra pointers, the reference count (ob_refcnt) and a
 * pointer to the object's type (ob_type). */
#define PyObject_HEAD                   \
    _PyObject_HEAD_EXTRA                \
    Py_ssize_t ob_refcnt;               \
    struct _typeobject *ob_type;

/* Define pointers to support a doubly-linked list of all live heap objects. */
/* NOTE(review): upstream object.h appears to use this expansion only in
 * Py_TRACE_REFS builds and an empty one otherwise -- confirm. */
#define _PyObject_HEAD_EXTRA            \
    struct _object *_ob_next;           \
    struct _object *_ob_prev;

/* The generic object type: nothing but the PyObject_HEAD fields. */
typedef struct _object {
    PyObject_HEAD
} PyObject;

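因此，PyStringObject对象结构如下图所示（注意：_ob_next 和 _ob_prev 两个指针仅在定义了 Py_TRACE_REFS 的调试构建中存在；普通发布版本中 _PyObject_HEAD_EXTRA 展开为空，对象头只包含 ob_refcnt 和 ob_type）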

@startuml MicroService


class PyStringObject {
    -struct _object *_ob_next
    -struct _object *_ob_prev
    -- _PyObject_HEAD_EXTRA↑ --
    +Py_ssize_t ob_refcnt对象引用数
    +struct _typeobject *ob_type
    -- PyObject_HEAD↑  --
    +Py_ssize_t ob_size对象大小
    == PyObject_VAR_HEAD↑ ==
    +long ob_shash
    +int ob_sstate
    +char ob_sval[1]
}

@enduml

1.2 PyString_Type对象#

在 stringobject.c 中的3816~3858行可以找到PyString_Type的定义:

/* Type object for the Python 2.7 `str` type.  The initializer is positional:
 * every entry fills the PyTypeObject slot named in its trailing comment, and
 * a 0 entry leaves that slot unset. */
PyTypeObject PyString_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                      /* tp_name */
    PyStringObject_SIZE,                        /* tp_basicsize */
    sizeof(char),                               /* tp_itemsize */
    string_dealloc,                             /* tp_dealloc */
    (printfunc)string_print,                    /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_compare */
    string_repr,                                /* tp_repr */
    &string_as_number,                          /* tp_as_number */
    &string_as_sequence,                        /* tp_as_sequence */
    &string_as_mapping,                         /* tp_as_mapping */
    (hashfunc)string_hash,                      /* tp_hash: hash function */
    0,                                          /* tp_call */
    string_str,                                 /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    &string_as_buffer,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE | Py_TPFLAGS_STRING_SUBCLASS |
        Py_TPFLAGS_HAVE_NEWBUFFER,              /* tp_flags */
    string_doc,                                 /* tp_doc */
    0,                                          /* tp_traverse */
    0,                                          /* tp_clear */
    (richcmpfunc)string_richcompare,            /* tp_richcompare: comparison function */
    0,                                          /* tp_weaklistoffset */
    0,                                          /* tp_iter */
    0,                                          /* tp_iternext */
    string_methods,                             /* tp_methods: table defining join, split, lower, endswith, replace, etc. */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    &PyBaseString_Type,                         /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    0,                                          /* tp_init */
    0,                                          /* tp_alloc */
    string_new,                                 /* tp_new */
    PyObject_Del,                               /* tp_free */
};

1.3 PyString_FromString方法#

最底层的生成字符串的函数方法为 PyString_FromString。

// stringobject.c
PyObject *
PyString_FromString(const char *str)
{
    ...
    size = strlen(str);
    ...

    /* Inline PyObject_NewVar */
    op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
    if (op == NULL)
        return PyErr_NoMemory();
    (void)PyObject_INIT_VAR(op, &PyString_Type, size);
    op->ob_shash = -1;                    # 暂不计算哈希值(使用时才计算 lazy_init)
    op->ob_sstate = SSTATE_NOT_INTERNED;  # 暂不使用intern 机制
    Py_MEMCPY(op->ob_sval, str, size+1);  // 将**原始C字串**拷贝给 ob_sval字段

    ...
    ...
    return (PyObject *) op;
}

该函数根据原始的 C 语言const char *str生成对应的 PyStringObject。原始字符串被复制到 ob_sval 中。

1.4 intern 机制#

str的intern 机制类似于小整数对象池：内容相同且被 intern 的字符串会共享同一个 PyStringObject 对象（而不仅仅是共享 ob_sval 这段内存）。例如，若程序中出现了 100 次被 intern 的 hello, world，那么在内存中只会保存一份。

intern 机制的核心在于字典 interned。该字典为 Python 的内建数据结构，可以简单等价于 C++ 的 map<T,R>。该字典的键和值都为字符串对象本身（pystring:pystring），所有需 intern 的字符串会缓存到该 interned 字典中，当在程序中再遇到相同的字符串 pystring，便可通过字典在 O(1) 时间内检索出。

/* stringobject.c (excerpt; "..." marks code elided by the article) */
/* Tail of PyString_FromString: strings of length 0 and 1 are interned and
 * remembered in the `nullstring` / `characters` caches before returning. */
PyObject *
PyString_FromString(const char *str)
{
    ...
    ...
    /* share short strings */
    if (size == 0) {
        /* Empty string: intern it and remember it in `nullstring`. */
        PyObject *t = (PyObject *)op;
        /* t is passed by address and op is reloaded from it afterwards,
         * so the call may leave t pointing at a different object. */
        PyString_InternInPlace(&t);
        op = (PyStringObject *)t;
        nullstring = op;
        /* presumably accounts for the reference now held by nullstring -- confirm */
        Py_INCREF(op);
    } else if (size == 1) {
        /* One-character string: intern it and remember it in `characters`,
         * indexed by the character's value masked with UCHAR_MAX. */
        PyObject *t = (PyObject *)op;
        PyString_InternInPlace(&t);
        op = (PyStringObject *)t;
        characters[*str & UCHAR_MAX] = op;
        /* presumably accounts for the reference now held by characters[] -- confirm */
        Py_INCREF(op);
    }
    return (PyObject *) op;
}

1.5 字符串拼接#

字符串虽然是变长对象，但并不是可变对象，创建之后，ob_sval 数组的长度无法再改变。在拼接两个字符串 s1, s2 时，必须重新生成一个 PyStringObject 对象来放置 s1->ob_sval + s2->ob_sval。如果要连接 N 个 PyStringObject 对象，那么就必须进行 N-1 次的内存申请及内存搬运的工作。毫无疑问，这将严重影响 Python 的执行效率。

所以官方推荐的做法是使用 join 函数，该函数一次性分配好所有内存，然后统一搬运。

示例（Python 2）：

# Join the items of a tuple into one string, using "-" as the separator.
separator = "-"
parts = ("a", "b", "c")
print(separator.join(parts))

2.Unicode#

2.1 PyUnicodeObject#

在 unicodeobject.h 中的 415~423行可以看到PyUnicodeObject定义如下

/* Layout of a Python 2.7 `unicode` object.  It starts with the fixed-size
 * PyObject_HEAD, keeps its own length field, and refers to the character
 * buffer through a pointer (str) instead of embedding it in the struct. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t length;          /* Length of raw Unicode data in buffer */
    Py_UNICODE *str;            /* Raw Unicode buffer */
    long hash;                  /* Hash value; -1 if not set */
    PyObject *defenc;           /* (Default) Encoded version as Python
                                   string, or NULL; this is used for
                                   implementing the buffer protocol */
} PyUnicodeObject;

照葫芦画瓢，PyUnicodeObject结构如下图所示

@startuml MicroService


class PyUnicodeObject {
    -struct _object *_ob_next
    -struct _object *_ob_prev
    -- _PyObject_HEAD_EXTRA↑ --
    +Py_ssize_t ob_refcnt对象引用数
    +struct _typeobject *ob_type
    == PyObject_HEAD↑  ==
    +Py_ssize_t length    /* Length of raw Unicode data in buffer */
    +Py_UNICODE *str      /* Raw Unicode buffer */
    +long hash           /* Hash value; -1 if not set */
    +PyObject *defenc   /*编码后的对象指针, 可能是ascii、utf-8、latin-1 or mbcs格式, 默认为ascii*/
}

@enduml

2.2 PyUnicode_Type#

在 unicodeobject.c 中的8902~8944行可以找到PyUnicode_Type的定义:

/* Type object for the Python 2.7 `unicode` type.  The initializer is
 * positional: every entry fills the PyTypeObject slot named in its trailing
 * comment, and a 0 entry leaves that slot unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_size */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash: hash function */
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str: conversion to str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods: table defining encode, split, join, endswith, replace, etc. */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};

reference#

Python 2.7 源码 - 字符串对象