Serializing with dependencies in python
A tutorial on how to hack dill to store necessary imports along with your pickled object
When you want to pickle a python object for long term storage, you can run into a problem: pickle does not store object definitions when it pickles. So for example when you build a class Greeter
and then pickle it and unpickle it at another location, you already need to have class Greeter
correctly defined before you can load the pickle at the target destination.
def greeting1():
    """Return the first sample greeting."""
    return "Booyaa!"
def greeting2():
    """Return the second sample greeting."""
    return "Howdy!"
class Greeter:
    """Hold a collection of greeting callables and print what each returns."""

    def __init__(self, greetings):
        """Store *greetings*, an iterable of zero-argument callables."""
        self.greetings = greetings

    def greet(self):
        """Call every stored greeting in order and print its return value."""
        # The loop variable is named `greeting` so it does not shadow the
        # method name `greet`.
        for greeting in self.greetings:
            print(greeting())
greeter = Greeter([greeting1, greeting2])
greeter.greet()

import pickle

# A context manager guarantees the file is flushed and closed once the dump
# finishes, instead of leaving the handle open until garbage collection.
with open("greeter.pkl", "wb") as pickle_file:
    pickle.dump(greeter, pickle_file)
If you now try to load the greeter somewhere else, you will get an AttributeError
:
>>> import pickle
>>> pickle.load(open("greeter.pkl", "rb"))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: Can't get attribute 'Greeter' on <module '__main__' (built-in)>
The usual fix is to move the definitions into a module and import that module wherever you unpickle. However, now you have created a dependency that you need to manage. You always have to make sure that you have the right version of the right module at hand when you want to unpickle. Especially for long-term storage of Python objects, this is begging for problems. It would be nicer if you could have the object itself and the definition all in one file!
Luckily there is a drop-in replacement for pickle
called dill
that unfortunately does not come with the standard library, so you have to install it yourself: pip install dill
.
The nice thing about dill
is that it stores definitions along with the object, as long as they are defined in __main__
. In our case they are, so when we store the greeter instance with dill, we can actually reload the object now:
import dill
# A context manager guarantees the file is flushed and closed after the dump.
with open("greeter.pkl", "wb") as pickle_file:
    dill.dump(greeter, pickle_file)
>>> import dill
>>> greeter = dill.load(open("greeter.pkl", "rb"))
>>> greeter.greet()
Booyaa!
Howdy!
It worked!
Suppose we define Greeter
in a greeter.py
module:
greeter.py:
def greeting1():
    """Return the first sample greeting."""
    return "Booyaa!"
def greeting2():
    """Return the second sample greeting."""
    return "Howdy!"
class Greeter:
    """Hold a collection of greeting callables and print what each returns."""

    def __init__(self, greetings):
        """Store *greetings*, an iterable of zero-argument callables."""
        self.greetings = greetings

    def greet(self):
        """Call every stored greeting in order and print its return value."""
        # The loop variable is named `greeting` so it does not shadow the
        # method name `greet`.
        for greeting in self.greetings:
            print(greeting())
import dill
from greeter import Greeter, greeting1, greeting2
g = Greeter([greeting1, greeting2])

# A context manager guarantees the file is flushed and closed after the dump.
with open("greeter.pkl", "wb") as pickle_file:
    dill.dump(g, pickle_file)
Then the pickle again fails to load when greeter.py
is either missing or lacks the right definitions:
>>> import dill
>>> greeter = dill.load(open("_notebooks/greeter.pkl", "rb"))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/oege/.pyenv/versions/3.8.5/lib/python3.8/site-packages/dill/_dill.py", line 278, in load
    return Unpickler(file, ignore=ignore, **kwds).load()
  File "/Users/oege/.pyenv/versions/3.8.5/lib/python3.8/site-packages/dill/_dill.py", line 481, in load
    obj = StockUnpickler.load(self)
  File "/Users/oege/.pyenv/versions/3.8.5/lib/python3.8/site-packages/dill/_dill.py", line 471, in find_class
    return StockUnpickler.find_class(self, module, name)
ModuleNotFoundError: No module named 'greeter'
You can work around this problem by mainifying
the imported definitions:
import dill
from greeter import Greeter, greeting1, greeting2
def mainify(obj):
    """Redefine *obj* in ``__main__`` if it was defined in another module.

    dill stores a definition along with the pickled object only when that
    definition lives in ``__main__``, so the source of *obj* is re-executed
    there. The new definition is bound in ``__main__`` under the name used
    in its source; *obj* itself is left untouched and nothing is returned,
    so callers must look the new definition up in ``__main__`` afterwards.
    """
    if obj.__module__ == "__main__":
        return  # already defined in __main__, nothing to do

    import __main__
    import inspect

    source = inspect.getsource(obj)
    code = compile(source, '<string>', 'exec')
    # Executing with __main__'s namespace as globals makes the fresh
    # definition report "__main__" as its __module__.
    exec(code, __main__.__dict__)
# When this script runs as __main__, each call re-executes the definition in
# this very namespace, so the imported names below are rebound to the copies
# that dill can serialize by value.
mainify(Greeter)
mainify(greeting1)
mainify(greeting2)
print(Greeter.__module__, greeting1.__module__, greeting2.__module__)

g = Greeter([greeting1, greeting2])

# A context manager guarantees the file is flushed and closed after the dump.
with open("greeter.pkl", "wb") as pickle_file:
    dill.dump(g, pickle_file)
And this works:
>>> import dill
>>> greeter = dill.load(open("_notebooks/greeter.pkl", "rb"))
>>> greeter.greet()
Booyaa!
Howdy!
To avoid calling mainify by hand, you can build it into the class as an alternate constructor.
greeter2.py:
class Greeter:
    """Greeter with an alternate constructor that yields dill-friendly instances.

    The main script imports this class as ``Greeter`` and also imports
    ``greeting1`` and ``greeting2`` from this module, so those functions are
    expected to be defined here as well.

    Imports are kept inside the methods on purpose: the class source is
    re-executed in ``__main__``, where this module's top-level imports are
    not guaranteed to exist.
    """

    def __init__(self, greetings):
        """Store *greetings*, an iterable of zero-argument callables."""
        self.greetings = greetings

    def greet(self):
        """Call every stored greeting in order and print its return value."""
        for greeting in self.greetings:
            print(greeting())

    @classmethod
    def dillable(cls, greetings):
        """Build an instance whose class and greetings are defined in ``__main__``.

        Every greeting and the class itself are redefined in ``__main__``
        when needed; the returned object is an instance of the ``__main__``
        copy of the class holding the ``__main__`` copies of the greetings,
        looked up there by ``__name__``.
        """
        import __main__

        # Materialize once: the greetings are walked twice below, which
        # would silently drop everything from a one-shot iterator.
        greetings = list(greetings)
        for greeting in greetings:
            cls._mainify(greeting)
        cls._mainify(cls)

        main_cls = getattr(__main__, cls.__name__)
        main_greetings = [
            getattr(__main__, greeting.__name__) for greeting in greetings
        ]
        return main_cls(main_greetings)

    @staticmethod
    def _mainify(obj):
        """If obj is not defined in __main__ then redefine it in
        main so that dill will serialize the definition along with the object"""
        if obj.__module__ == "__main__":
            return  # already defined in __main__, nothing to do

        import __main__
        import inspect

        source = inspect.getsource(obj)
        code = compile(source, '<string>', 'exec')
        exec(code, __main__.__dict__)
Now you can import in main:
import dill
from greeter2 import Greeter, greeting1, greeting2
g = Greeter.dillable([greeting1, greeting2])
g.greet()

# A context manager guarantees the file is flushed and closed after the dump.
with open("greeter.pkl", "wb") as pickle_file:
    dill.dump(g, pickle_file)
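Alternatively, you can override __new__ so that the regular constructor does the mainifying automatically.
greeter3.py: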
class Greeter:
    """Greeter whose regular constructor yields dill-friendly instances.

    The main script imports this class as ``Greeter`` and also imports
    ``greeting1`` and ``greeting2`` from this module, so those functions are
    expected to be defined here as well.

    Imports are kept inside the methods on purpose: the class source is
    re-executed in ``__main__``, where this module's top-level imports are
    not guaranteed to exist.
    """

    def __init__(self, greetings):
        """Store *greetings*, an iterable of zero-argument callables."""
        self.greetings = greetings

    def greet(self):
        """Call every stored greeting in order and print its return value."""
        for greeting in self.greetings:
            print(greeting())

    def __new__(cls, greetings=None):
        """Create the instance from the ``__main__`` copy of the class.

        With *greetings* given, the class and every greeting are redefined
        in ``__main__`` when needed and the returned object is an instance
        of the ``__main__`` copy. Without *greetings* a bare, uninitialized
        instance of *cls* is returned.
        """
        import __main__

        if greetings is None:
            # Called without arguments -- presumably how an unpickler
            # recreates the instance before restoring its state (see the
            # pickle docs; confirm for dill). Nothing to redefine here.
            return object.__new__(cls)

        # Materialize once: the greetings are walked twice below.
        greetings = list(greetings)

        cls._mainify(cls)
        cls = getattr(__main__, cls.__name__)
        obj = object.__new__(cls)

        for greeting in greetings:
            cls._mainify(greeting)
        main_greetings = [
            getattr(__main__, greeting.__name__) for greeting in greetings
        ]
        # Python only calls __init__ automatically when __new__ returns an
        # instance of the class that was called. When the imported class is
        # called, obj belongs to the __main__ copy instead, so it has to be
        # initialized by hand.
        # NOTE(review): if the class being called already lives in __main__,
        # Python runs __init__ a second time with the caller's original
        # greetings, replacing the __main__ copies stored here.
        obj.__init__(main_greetings)
        return obj

    @staticmethod
    def _mainify(obj):
        """If obj is not defined in __main__ then redefine it in
        main so that dill will serialize the definition along with the object"""
        if obj.__module__ == "__main__":
            return  # already defined in __main__, nothing to do

        import __main__
        import inspect

        source = inspect.getsource(obj)
        code = compile(source, '<string>', 'exec')
        exec(code, __main__.__dict__)
import dill
from greeter3 import Greeter, greeting1, greeting2
g = Greeter([greeting1, greeting2])
g.greet()

# A context manager guarantees the file is flushed and closed after the dump.
with open("greeter.pkl", "wb") as pickle_file:
    dill.dump(g, pickle_file)