How to properly deepcopy linked objects in C Extension

29 Views Asked by At

In Python, I have a pair of classes like

class Internal:
    """Holds a reference to an object supplied by the caller (no copy is made)."""

    def __init__(self, ref):
        # Store the very same object that was passed in, so changes made
        # through any other reference to it are visible via `self.ref`.
        self.ref = ref

class External:
    """Owns a list and an `Internal` that refers to that same list."""

    def __init__(self):
        self.storage = [1, 2, 3]
        # `Internal` receives the list object itself, not a copy of it.
        self.int = Internal(self.storage)

    def change(self):
        """Mutate the owned list in place: its second element becomes 10."""
        self.storage[1] = 10

Deep copy of External works incredibly well

from copy import deepcopy

# Demonstrates that deep-copying `External` also re-links the copied
# `Internal` to the copied storage.
s = External()
s1 = deepcopy(s)  # the copy must be taken before the original is mutated
s.change()  # mutates the original's storage only
print(s.int.ref)  # [1,10,3]
print(s1.int.ref)   # [1,2,3]

I want to implement Internal using Python C Extension. As far as I understand I have to implement either __reduce__ or __deepcopy__. I decided to go with __deepcopy__. Currently, I implemented it as

// `__deepcopy__` for Internal (first attempt).
// Creates a new instance of the same type and makes it point at the SAME
// object as `self->ref`; `memo` is not consulted. Returns a new reference,
// or nullptr if the allocation failed.
PyObject* internal_deepcopy(InternalObj* self, PyObject* memo) {
    PyObject* raw = PyType_GenericNew(Py_TYPE(self), nullptr, nullptr);
    if (raw == nullptr) return nullptr;

    // The new object shares the referenced object with the original, so it
    // takes its own strong reference to it.
    Py_INCREF(self->ref);
    reinterpret_cast<InternalObj*>(raw)->ref = self->ref;
    return raw;
}

Obviously, this is wrong: it copies only the pointer, so after a deep copy the Internal inside the copied External still refers to the original storage instead of the copied one.

from internals import Internal

class External:
    """Owns a list and a C-extension `Internal` that refers to that same list."""

    def __init__(self):
        self.storage = [1, 2, 3]
        # `Internal` receives the list object itself, not a copy of it.
        self.int = Internal(self.storage)

    def change(self):
        """Mutate the owned list in place: its second element becomes 10."""
        self.storage[1] = 10

# Same experiment as before, now with the C-extension `Internal`.
s = External()
s1 = deepcopy(s)  # the copy must be taken before the original is mutated
s.change()
s.int.print()  # 1 10 3
s1.int.print()  # 1 10 3  <- the copy still shows the original's (mutated) storage

That is, the two objects were not separated.

The question: how can I detect that the enclosing External instance is being copied, and make the copied Internal refer to the copied storage, just as it does in the pure-Python version?


Other parts of my Python module

#include <Python.h>
#include <iostream>

// Instance layout of the `internals.Internal` extension type.
struct InternalObj {
    PyObject_HEAD  // standard CPython object header; must come first
    // Strong reference to the object passed to the constructor: taken in
    // internal_new and released in internal_dealloc.
    PyObject* ref;
};

// `Internal.print()`: writes the elements of `self->ref` to stdout as
// space-separated integers, followed by a newline.
//
// Returns a new reference to None. Returns nullptr with a Python exception
// set if `ref` is not a sequence or an element cannot be converted to a C
// long; in the latter case the elements before it have already been written.
PyObject* internal_print(InternalObj* self, PyObject* unused) {
    // The PySequence_Fast_* macros are only valid on the result of
    // PySequence_Fast, which also rejects non-sequences with a TypeError.
    PyObject* seq = PySequence_Fast(self->ref, "Internal.ref must be a sequence");
    if (!seq) return nullptr;

    const Py_ssize_t size = PySequence_Fast_GET_SIZE(seq);  // Py_ssize_t: no truncation to int
    PyObject** items = PySequence_Fast_ITEMS(seq);
    for (Py_ssize_t i{}; i < size; ++i) {
        const long value = PyLong_AsLong(items[i]);
        if (value == -1 && PyErr_Occurred()) {  // -1 is also a legitimate value
            Py_DECREF(seq);
            return nullptr;
        }
        std::cout << value << ' ';
    }
    std::cout << std::endl;  // flush so the line is visible immediately
    Py_DECREF(seq);
    Py_RETURN_NONE;  // returning bare Py_None would hand out a reference we do not own
}

// Method table of `Internal`: `__deepcopy__(memo)` takes exactly one
// argument (METH_O) and `print()` takes none (METH_NOARGS). Neither has a
// docstring.
PyMethodDef internal_methods[]{
    {"__deepcopy__", reinterpret_cast<PyCFunction>(internal_deepcopy), METH_O, nullptr},
    {"print", reinterpret_cast<PyCFunction>(internal_print), METH_NOARGS, nullptr},
    {nullptr, nullptr, 0, nullptr},  // sentinel terminating the table
};

// tp_dealloc of `Internal`: drops the reference held in `ref` and frees the
// object's memory.
void internal_dealloc(InternalObj* self) {
    // `ref` can still be null here: an instance may be destroyed after it was
    // allocated but before `ref` was assigned (for example on an error path
    // of __deepcopy__), so use the null-tolerant release.
    Py_XDECREF(self->ref);  // release
    Py_TYPE(self)->tp_free(self);
}

// tp_new of `Internal`: implements `Internal(ref)`.
// Accepts exactly one argument, positional or by the keyword `ref`, and
// stores a strong reference to it in the new instance. Returns the new
// instance, or nullptr with an exception set if argument parsing or the
// allocation failed.
PyObject* internal_new(PyTypeObject* subtype, PyObject* args, PyObject* kwds) {
    const char* keywords[]{"ref", nullptr};
    PyObject* target = nullptr;
    const bool parsed =
        PyArg_ParseTupleAndKeywords(args, kwds, "O:__init__", const_cast<char**>(keywords), &target);
    if (!parsed) return nullptr;

    PyObject* instance = PyType_GenericNew(subtype, nullptr, nullptr);
    if (instance == nullptr) return nullptr;

    // "O" yields a borrowed reference; the instance needs its own.
    Py_INCREF(target);  // capture
    reinterpret_cast<InternalObj*>(instance)->ref = target;
    return instance;
}

// Static type object for `internals.Internal`.
// Slots are initialised positionally, so their order must match the
// PyTypeObject layout of the targeted Python version; only tp_name,
// tp_basicsize, tp_dealloc, tp_flags, tp_methods and tp_new are set.
// Construction happens entirely in tp_new (internal_new); tp_init is unused.
// NOTE(review): the type stores a PyObject* but sets no tp_traverse/tp_clear
// and no GC flag; a reference cycle through `ref` would presumably not be
// collected - confirm whether that matters for the intended use.
PyTypeObject internal_type{
    PyVarObject_HEAD_INIT(nullptr, 0)
    "internals.Internal",  // tp_name
    sizeof(InternalObj),  // tp_basicsize
    0,  // tp_itemsize
    (destructor)internal_dealloc,  // tp_dealloc
    0,  // tp_vectorcall_offset
    0,  // tp_getattr
    0,  // tp_setattr
    0,  // tp_as_async
    0,  // tp_repr
    0,  // tp_as_number
    0,  // tp_as_sequence
    0,  // tp_as_mapping
    0,  // tp_hash
    0,  // tp_call
    0,  // tp_str
    0,  // tp_getattro
    0,  // tp_setattro
    0,  // tp_as_buffer
    Py_TPFLAGS_DEFAULT,  // tp_flags
    0,  // tp_doc
    0,  // tp_traverse
    0,  // tp_clear
    0,  // tp_richcompare
    0,  // tp_weaklistoffset
    0,  // tp_iter
    0,  // tp_iternext
    internal_methods,  // tp_methods
    0,  // tp_members
    0,  // tp_getset
    0,  // tp_base
    0,  // tp_dict
    0,  // tp_descr_get
    0,  // tp_descr_set
    0,  // tp_dictoffset
    0,  // tp_init
    0,  // tp_alloc
    internal_new,  // tp_new
};

// Definition of the `internals` extension module. It declares no
// module-level functions; the `Internal` type is added in PyInit_internals.
PyModuleDef internals_module{
    PyModuleDef_HEAD_INIT,
    "internals",  // m_name
    "Python interface for internals",  // m_doc
    -1,  // m_size: no per-module state is allocated
    nullptr,  // m_methods
    nullptr,  // m_slots
    nullptr,  // m_traverse
    nullptr,  // m_clear
    nullptr  // m_free
};

// Module entry point: finalises the `Internal` type, creates the `internals`
// module and exposes the type as `internals.Internal`.
// Returns the new module, or nullptr with an exception set on any failure.
PyMODINIT_FUNC PyInit_internals() {
    // Ready the type first, so a failure here cannot leak a created module.
    if (PyType_Ready(&internal_type) < 0) return nullptr;

    PyObject* module = PyModule_Create(&internals_module);
    if (!module) return nullptr;

    // PyModule_AddObject steals the reference only on success; on failure
    // both the type reference and the module must be released here.
    PyObject* type_ref = Py_NewRef(&internal_type);
    if (PyModule_AddObject(module, "Internal", type_ref) < 0) {
        Py_DECREF(type_ref);
        Py_DECREF(module);
        return nullptr;
    }
    return module;
}

Meson script that I use to build.

# Meson build definition: builds the `internals` extension module from internal.cpp.
project('ints', 'c', 'cpp', version: '0.1', default_options: ['c_std=c18', 'cpp_std=c++20', 'b_ndebug=if-release'])
# Locate the python3 installation; configuration fails if none is found.
py_installation = import('python').find_installation('python3', required: true)
# Compile internal.cpp into the extension module `internals` against that installation.
py_installation.extension_module('internals', 'internal.cpp', dependencies: py_installation.dependency())
1

There are 1 best solutions below

0
Askold Ilvento On

It looks scary, but technically it is quite straightforward. The answer has two parts.

  1. How does Python deep-copy reference cycles? E.g.:
l = []
l.append(l)  # the list now contains itself
q = deepcopy(l)  # the cycle is copied as well

works without issue.

deepcopy temporarily records every object it has copied in its second argument, memo, in the form {id(old): new}. If an object has already been processed, deepcopy returns the existing copy from memo. If not, it creates a new object, registers it in memo, and then deep-copies everything that object references.

  2. How should this be applied in the C API?

Import deepcopy (in my case this step could be skipped, because I know that my ref is already in memo; it is needed in the general case, where it might not be):

PyObject *copy = PyImport_ImportModule("copy");
deepcopy = PyObject_GetAttrString(copy, "deepcopy");

This is needed because list (and many other types) has no __deepcopy__ method; their deep copy is implemented in Python inside the copy module. I have seen people perform this import inside the __deepcopy__ implementation itself, but I would suggest doing it once, in module init (PyInit_internals in my case).

Then implement the correct version of deep copy:

// `__deepcopy__(memo)` for Internal.
//
// Returns a new reference to a copy of `self` whose `ref` is a deep copy of
// `self->ref`, produced by calling `deepcopy` (the file-level reference to
// `copy.deepcopy`) with the same `memo`. Objects already copied during this
// deep-copy pass are therefore reused, not duplicated.
// `memo` is expected to be a dict mapping id(original) -> copy; a null or
// None `memo` is treated as a top-level call and a fresh dict is used.
// Returns nullptr with a Python exception set on failure.
PyObject* internal_deepcopy(InternalObj* self, PyObject* memo) {
    // `memo` is keyed by id(obj), which in CPython is the object's address.
    // PyLong_FromVoidPtr builds exactly that value; a pointer cannot be
    // static_cast to long, and long may be narrower than a pointer.
    PyObject* id = PyLong_FromVoidPtr(self);
    if (!id) return nullptr;

    if (memo && memo != Py_None) {
        PyObject* memed = PyDict_GetItemWithError(memo, id);  // borrowed reference
        if (memed) {
            Py_DECREF(id);
            return Py_NewRef(memed);  // the caller must receive an owned reference
        }
        if (PyErr_Occurred()) {  // lookup failed, as opposed to "key not present"
            Py_DECREF(id);
            return nullptr;
        }
        Py_INCREF(memo);  // to unify exit code with next branch where `memo` is created.
    } else {
        memo = PyDict_New();  // top-level call
        if (!memo) {
            Py_DECREF(id);
            return nullptr;
        }
    }

    // Create the copy of this object.
    InternalObj* obj = reinterpret_cast<InternalObj*>(PyType_GenericNew(Py_TYPE(self), nullptr, nullptr));
    if (!obj) {
        Py_DECREF(id);
        Py_DECREF(memo);
        return nullptr;
    }
    // Give `ref` a valid placeholder immediately: from here on the object can
    // be reached through `memo` or destroyed on an error path, and it must
    // never be seen with a null `ref`.
    obj->ref = Py_NewRef(Py_None);

    // Register the copy BEFORE recursing, so that a cycle leading back to
    // `self` resolves to `obj` instead of recursing forever.
    if (PyDict_SetItem(memo, id, reinterpret_cast<PyObject*>(obj)) < 0) {
        Py_DECREF(id);
        Py_DECREF(memo);
        Py_DECREF(obj);
        return nullptr;
    }
    Py_DECREF(id);

    // Deep-copy the referenced object with the shared `memo`.
    PyObject* copied = PyObject_CallFunctionObjArgs(deepcopy, self->ref, memo, nullptr);
    Py_DECREF(memo);  // delete map if it was created in this deepcopy
    if (!copied) {
        Py_DECREF(obj);
        return nullptr;
    }

    // Swap the placeholder for the real copy; assign first, release after.
    PyObject* placeholder = obj->ref;
    obj->ref = copied;
    Py_DECREF(placeholder);
    return reinterpret_cast<PyObject*>(obj);  // return copied object.
}

I'm not sure that the code above covers the general case. For instance, copy.deepcopy does some bookkeeping of object lifetimes that is missing here. But for simple cases it should be correct.