Problem
I used defaultdict to specify the dtype of each column. When I tried to ffill the dataframe, I got the following error. I think the error was caused by the dtype, but I have to specify it because my data is too large (1394265 rows × 300 columns) and I don't have enough memory. How can I solve the problem and ffill correctly?
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-42-feb22c34d9cb> in <cell line: 4>()
2 dtype['timestamp']=np.int64
3 table1=pd.read_csv(data_root+'err_reprdc_sample.csv',index_col=0,dtype=dtype)
----> 4 table1.ffill()
14 frames
/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
332
333 # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in ffill(self, axis, inplace, limit, downcast)
11781 downcast: dict | None = None,
11782 ) -> DataFrame | None:
> 11783 return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
11784
11785 @overload
/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
332
333 # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in ffill(self, axis, inplace, limit, downcast)
6985 Object with missing values filled or None if ``inplace=True``.
6986 """
-> 6987 return self.fillna(
6988 method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
6989 )
/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
332
333 # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in fillna(self, value, method, axis, inplace, limit, downcast)
5633 downcast: dict | None = None,
5634 ) -> DataFrame | None:
-> 5635 return super().fillna(
5636 value=value,
5637 method=method,
/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in fillna(self, value, method, axis, inplace, limit, downcast)
6811 return result
6812
-> 6813 new_data = self._mgr.interpolate(
6814 method=method,
6815 axis=axis,
/usr/local/lib/python3.10/dist-packages/pandas/core/internals/managers.py in interpolate(self, **kwargs)
420
421 def interpolate(self: T, **kwargs) -> T:
--> 422 return self.apply("interpolate", **kwargs)
423
424 def shift(self: T, periods: int, axis: int, fill_value) -> T:
/usr/local/lib/python3.10/dist-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
350 applied = b.apply(f, **kwargs)
351 else:
--> 352 applied = getattr(b, f)(**kwargs)
353 except (TypeError, NotImplementedError):
354 if not ignore_failures:
/usr/local/lib/python3.10/dist-packages/pandas/core/internals/blocks.py in interpolate(self, method, axis, index, inplace, limit, limit_direction, limit_area, fill_value, downcast, **kwargs)
1256 data = cast(np.ndarray, data) # bc overridden by ExtensionBlock
1257
-> 1258 missing.interpolate_array_2d(
1259 data,
1260 method=method,
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in interpolate_array_2d(data, method, axis, index, limit, limit_direction, limit_area, fill_value, coerce, downcast, **kwargs)
237 raise ValueError("Cannot pass both fill_value and method")
238
--> 239 interpolate_2d(
240 data,
241 method=m,
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in interpolate_2d(values, method, axis, limit, limit_area)
813 # _pad_2d and _backfill_2d both modify tvalues inplace
814 if method == "pad":
--> 815 _pad_2d(tvalues, limit=limit)
816 else:
817 _backfill_2d(tvalues, limit=limit)
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in new_func(values, limit, mask)
847 return result.view(values.dtype), mask
848
--> 849 return func(values, limit=limit, mask=mask)
850
851 return cast(F, new_func)
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in _pad_2d(values, limit, mask)
879
880 if np.all(values.shape):
--> 881 algos.pad_2d_inplace(values, mask, limit=limit)
882 else:
883 # for test coverage
/usr/local/lib/python3.10/dist-packages/pandas/_libs/algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()
TypeError: No matching signature found
Reproduce the problem
import pandas as pd
import numpy as np
from collections import defaultdict
data_root='./'
dtype=defaultdict(np.float16)
dtype['timestamp']=np.int64
table1=pd.read_csv(data_root+'err_reprdc_sample.csv',index_col=0,dtype=dtype)
table1.ffill()
You can download the sample file called 'err_reprdc_sample' and adjust the data_root.
| timestamp | 000001.XSHE | 000002.XSHE | 000596.XSHE |
|---|---|---|---|
| 20220628092500000 | 14.47 | ||
| 20220628092500010 | 18.5 | ||
| 20220628092500020 | 234.1 | ||
| 20220628092500040 | |||
| 20220628092500050 |
What I have tried
table1=pd.read_csv(data_root+'err_reprdc_sample.csv',index_col=0)
table1.ffill()
or changing np.float16 to np.float32. Both work fine, but they take too much memory.
Try this:
Or:
Explanation
The type np.float16 can sometimes behave unpredictably with certain pandas operations, as it is not as widely supported or tested as np.float32 or np.float64. Your code is not working because of that: the `TypeError: No matching signature found` at the bottom of the traceback is raised by the compiled `pad_2d_inplace` routine, which has no variant that accepts float16 data.
In the first solution, I'm simply converting the data types prior to calling ffill. In the second solution, I've replaced np.float16 with np.float64. Both modifications to the original code were made for the reason stated above.