When I add columns to DataFrames stored in a shelf object, the shelf file grows far faster than the data does. Every time I add a column, the file grows by roughly the size of a complete copy of all the entries, as if the old copies were kept alongside the new ones. The number of keys does not change, and the new columns are present when I access the modified DataFrames. The in-memory size of the DataFrames increases only slightly, nowhere near in proportion to the file size.
import os, sys
import shelve
import numpy as np
import pandas as pd


class shelf_class():
    def __init__(self, dbfile):
        # A database of Dataframes
        self._db = shelve.open(dbfile)

    def __getitem__(self, key):
        return self._db[key]

    def __setitem__(self, key, value):
        self._db[key] = value

    def __delitem__(self, key):
        del self._db[key]

    def add_sum(self, key):
        new_data = self._db[key]
        new_data['sum'] = self._db[key].to_numpy().sum(axis=1)
        self._db[key] = new_data

    def add_mean(self, key):
        new_data = self._db[key]
        new_data['mean'] = self._db[key].to_numpy().mean(axis=1)
        self._db[key] = new_data


filename = 'my_file'
store = shelf_class(filename)

keys = [str(x) for x in range(10)]
for i in keys:
    store[i] = pd.DataFrame(np.random.random((100, 10)))

s = os.path.getsize(filename + '.dat')
print(f'file size = {s:d}')
s = sys.getsizeof(store[i])
print(f'dataframe size = {s:d}')

for i in keys:
    store.add_sum(i)

s = os.path.getsize(filename + '.dat')
print(f'file size = {s:d}')
s = sys.getsizeof(store[i])
print(f'dataframe size = {s:d}')

for i in keys:
    store.add_mean(i)

s = os.path.getsize(filename + '.dat')
print(f'file size = {s:d}')
s = sys.getsizeof(store[i])
print(f'dataframe size = {s:d}')
This produces:
file size = 86917
dataframe size = 8144
file size = 179152
dataframe size = 8544
file size = 276416
dataframe size = 8944
What am I doing wrong?
This is a known problem with shelve: when an entry is rewritten with a larger value, the underlying dbm backend generally does not reuse the space occupied by the old record, so the file accumulates stale copies of the data until it is compacted. The post "Shelve dictionary size is >100Gb for a 2Gb text file" provides a method to clean up and defragment the database file:
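A minimal sketch of that approach, assuming the shelf is backed by dbm.gnu (GNU gdbm), whose database objects provide a reorganize() method, and relying on the fact that shelve.Shelf keeps its underlying dbm object in the dict attribute:

import shelve

# Compact the shelf file in place: reorganize() rewrites the gdbm file so
# that space left behind by deleted or rewritten records is reclaimed.
with shelve.open('my_file') as db:
    db.dict.reorganize()
    db.dict.sync()

After reorganizing, the file on disk should shrink back to roughly the size of the live data.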
This requires the shelf file to be a dbm.gnu database, which can be checked using:
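For example, the standard library's dbm.whichdb() reports which dbm module matches a given database file:

import dbm

# Prints the dbm module that matches the file,
# e.g. 'dbm.gnu', 'dbm.ndbm' or 'dbm.dumb'.
print(dbm.whichdb('my_file'))

If it reports 'dbm.dumb' (which the my_file.dat / my_file.dir pair in the question suggests), reorganize() is not available, and the usual workaround is to copy the entries into a freshly created shelf.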