Is there a polars equivalent to np.outer()? np.outer() returns the outer product of two vectors. I cannot find an expressive way to represent this in Polars.
Polars equivalent to np.outer()
67 Views Asked by Jage At
2
There are 2 best solutions below
0
On
As was suggested in a comment, here is an implementation using a cross join. The approach taken here works with polars.LazyFrames (and should therefore work with polars.DataFrames as well). I’ve also included @Henrick's solution for verification and for timing (see below).
import numpy as np
import polars as pl
def crossjoin_mul(df, cola, colb):
    """Outer product of two columns via a cross join plus a lazy reshape.

    Every value of ``cola`` is paired with every value of ``colb`` by a
    cross join, the pairs are multiplied, and the resulting flat column of
    n*n products is regrouped into n struct fields to mimic the square
    output of ``np.outer``.  Only lazy expressions are used (no Python-side
    ``len``), so this works on both polars.LazyFrame and polars.DataFrame.

    df   : polars.LazyFrame or polars.DataFrame holding both columns
    cola : column expression (e.g. ``pl.col('a')``) for the left vector
    colb : column expression for the right vector
    """
    # Row count of the ORIGINAL frame, recovered lazily as the square root
    # of the cross-joined frame's row count — assumes cola and colb come
    # from the same frame and therefore have equal length n, so the join
    # has exactly n*n rows.
    nrows = pl.count().sqrt().cast(pl.Int32)
    return (
        # Cartesian product: every cola value paired with every colb value.
        df.select(cola).join(df.select(colb), how='cross')
        # Element-wise product of each pair -> one flat column of n*n values.
        .select(computed=cola * colb)
        # Chunk the flat column into consecutive runs of nrows values, one
        # run per output row; maintain_order keeps the row order stable.
        .group_by(pl.arange(0, pl.count()) // nrows, maintain_order=True)
        .agg('computed')
        # Turn each length-n list into a struct, then explode the struct
        # fields into separate columns -> an n x n frame.
        .select(pl.col('computed').list.to_struct())
        .unnest('computed')
    )
def row_expr_mul(df, cola, colb):
    """Outer product built as one scaled copy of *cola* per row of *df*.

    Eager DataFrames only: the number of expressions generated here depends
    on ``len(df)``, which a polars.LazyFrame cannot provide.
    """
    scaled_columns = [
        (pl.col(cola) * pl.col(colb).get(row)).alias(str(row))
        for row in range(len(df))
    ]
    return df.select(scaled_columns)
# Demo data: two 3-element vectors whose outer product is easy to check by eye.
df = pl.DataFrame({
    "a": [0, 1, 2],
    "b": [0, 1, 2],
})
ldf = df.lazy()

# Print numpy's reference outer product, then each polars implementation,
# each followed by a flag confirming it matches numpy element-for-element.
print(
    (np_res := np.outer(df['a'], df['b'])),                                   # reference answer
    (result := ldf.pipe(crossjoin_mul, pl.col('a'), pl.col('b')).collect()),  # lazy cross-join version
    f'{(result.to_numpy() == np_res).all() = }\n',
    (result := df.pipe(row_expr_mul, 'a', 'b')),                              # eager per-row version
    f'{(result.to_numpy() == np_res).all() = }\n',
    sep='\n',
)
# [[0 0 0]
# [0 1 2]
# [0 2 4]]
#
# shape: (3, 3)
# ┌─────────┬─────────┬─────────┐
# │ field_0 ┆ field_1 ┆ field_2 │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i64 │
# ╞═════════╪═════════╪═════════╡
# │ 0 ┆ 0 ┆ 0 │
# │ 0 ┆ 1 ┆ 2 │
# │ 0 ┆ 2 ┆ 4 │
# └─────────┴─────────┴─────────┘
# (result.to_numpy() == np_res).all() = True
# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ 0 ┆ 1 ┆ 2 │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╡
# │ 0 ┆ 0 ┆ 0 │
# │ 0 ┆ 1 ┆ 2 │
# │ 0 ┆ 2 ┆ 4 │
# └─────┴─────┴─────┘
# (result.to_numpy() == np_res).all() = True
Timings
The crossjoin/reshape approach performs quite poorly, especially as N (number of rows) increases. However, this is the only approach that supports a LazyFrame. If your data fit into memory, it would likely be best to materialize them and take one of the other approaches.
from contextlib import contextmanager
from time import perf_counter
import numpy as np
from numpy.random import default_rng
import polars as pl
def polars_crossmul(df, cola, colb):
    """Outer product via cross join + lazy reshape (same as ``crossjoin_mul``).

    Works on LazyFrames and DataFrames because only lazy expressions are
    used; included here under a benchmark-friendly name for the timings.
    """
    # Original row count n, recovered lazily as sqrt of the joined frame's
    # n*n rows — assumes both columns have the same length.
    nrows = pl.count().sqrt().cast(pl.Int32)
    return (
        # All n*n pairs of values from the two columns.
        df.select(cola).join(df.select(colb), how='cross')
        # Multiply each pair -> one flat column of products.
        .select(computed=cola * colb)
        # Split the flat column into n consecutive groups of n values.
        .group_by(pl.arange(0, pl.count()) // nrows, maintain_order=True)
        .agg('computed')
        # List -> struct -> n separate columns, yielding the n x n square.
        .select(pl.col('computed').list.to_struct())
        .unnest('computed')
    )
def polars_rowmul(df, cola, colb):
    """Outer product by generating one expression per row of *df*.

    Eager only — the expression count depends on ``len(df)``, so this
    cannot be expressed against a LazyFrame.
    """
    n = len(df)
    exprs = [
        (pl.col(cola) * pl.col(colb).get(i)).alias(str(i))
        for i in range(n)
    ]
    return df.select(exprs)
@contextmanager
def timer(msg):
    """Context manager that prints the wall-clock time of its body.

    Parameters
    ----------
    msg : str
        Label printed left-justified (40 chars) before the elapsed seconds.

    The timing line is emitted in a ``finally`` clause so a measurement is
    still reported even when the timed block raises — the original version
    silently lost the timing on any exception.
    """
    start = perf_counter()
    try:
        yield
    finally:
        stop = perf_counter()
        print(f'{msg: <40} {stop - start:.6f}s')
# Report library versions so the timings below can be reproduced/compared.
print(f'{pl.__version__ = }\n{np.__version__ = }', end='\n\n')

for nrows in [100, 1_000, 10_000, 20_000]:
    # Fresh seeded RNG per size -> identical data across approaches and runs.
    rng = default_rng(0)
    df = pl.DataFrame(rng.uniform(0, 100, size=(nrows, 2)), schema=['a', 'b'])
    ldf = df.lazy()
    print(f'{nrows = }')
    with timer('polars crossmul'):
        # polars with a crossjoin, multiply, and reshapes to mimic the square
        # output of an outer product; the only approach that accepts a LazyFrame.
        result = polars_crossmul(ldf, pl.col('a'), pl.col('b')).collect()
    with timer('polars → numpy → polars'):
        # use numpy for the outer product and convert the result back to a
        # polars DataFrame afterwards (column names "0".."n-1").
        t = df.to_numpy()
        result = pl.DataFrame(
            np.outer(t[:, 0], t[:, 1]),
            schema={str(i): pl.Float64 for i in range(t.shape[0])}
        )
    with timer('polars rowmul'):
        # one expression per row; note that this does not work with LazyFrame
        result = polars_rowmul(df, 'a', 'b')
    with timer('polars → numpy'):
        # just use numpy, with no conversion back to polars
        t = df.to_numpy()
        result = np.outer(t[:, 0], t[:, 1])
    print()
Outputs
pl.__version__ = '0.20.3'
np.__version__ = '1.22.4'
nrows = 100
polars crossmul 0.007882s
polars → numpy → polars 0.001929s
polars rowmul 0.001727s
polars → numpy 0.000215s
nrows = 1000
polars crossmul 0.026444s
polars → numpy → polars 0.017812s
polars rowmul 0.024762s
polars → numpy 0.003128s
nrows = 10000
polars crossmul 1.894327s
polars → numpy → polars 0.782990s
polars rowmul 0.355083s
polars → numpy 0.167935s
nrows = 20000
polars crossmul 7.641779s
polars → numpy → polars 3.510021s
polars rowmul 0.735001s
polars → numpy 0.561703s
Let's consider the following data.
Output.
Now, as outlined in my comment above, you could use generator expressions to compute the columns as follows. All expressions will be evaluated in parallel, but I am unsure about whether the generator expression itself will be a bottleneck for large row counts.
Output.
This assumes both vectors come from the same dataframe and, hence, are of the same length.