I have a very large pandas dataset and when I try to use set_index, I get an out-of-memory error, which I can somehow understand.
I would like to transform the datetime into just giving me the date for the index, which works normally with df.set_index(df.index.date, inplace=True).
I thought I could work around this using dask.dataframe, but I haven't found a way to get it working. I have tried different methods I found on SO (I will link them when I find them again) but to be honest, I do not understand well the what I'm doing there. Any help much appreciated!
import numpy as np
import dask.dataframe as dd
import pandas as pd
import datetime as dt
def make_df(date0,x,y):
''' simulate my data (normally much larger) '''
xx,yy = np.meshgrid(x,y)
zz = np.sin(xx/20) *np.cos(yy/30)
tt = pd.date_range(date0, freq='92min', periods=zz.size)
arr = np.full((xx.size,3),np.nan)
arr[:,0] = xx.flatten()
arr[:,1] = yy.flatten()
arr[:,2] = zz.flatten() + np.random.randn(zz.size) / 10
df = pd.DataFrame(arr, columns=['x','y','z'], index=tt)
df.index.name = 'time'
return df
# create dataset
dist = 2
date0 = dt.datetime(2023, 9, 23)
x = np.arange(-10,60,dist)
y = np.arange(30,70,dist)
df = make_df(date0,x,y)
df = dd.from_pandas(df, npartitions=150)
print(df)
### ValueError: Metadata inference failed in `lambda`.
# df.map_partitions(lambda x:
# x.set_index(df.index.dt.date)
# ).persist()
### AttributeError: 'Index' object has no attribute 'dt'
def func(x):
return pd.to_datetime(x.index.dt.date, utc=True)
meta = pd.Series([], name='time')
df['time'] = df.apply(func, axis=1, meta=meta)
df = df.set_index('time', #drop=False, compute=True
)
df.compute()
print(df)
return