Polars equivalent to np.outer()


Is there a polars equivalent to np.outer()? np.outer() returns the outer product of two vectors. I cannot find an expressive way to represent this in Polars.
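For reference, this is the NumPy behaviour in question (the example vectors are just for illustration):

import numpy as np

a = np.array([0, 1, 2])
b = np.array([1, 2, 3])

# np.outer(a, b)[i, j] == a[i] * b[j]
print(np.outer(a, b))
# [[0 0 0]
#  [1 2 3]
#  [2 4 6]]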

2 Answers

Accepted answer by Hericks

Let's consider the following data.

import polars as pl

df = pl.DataFrame({
    "a": [0, 1, 2],
    "b": [1, 2, 3],
})
df

Output.

shape: (3, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 0   ┆ 1   │
│ 1   ┆ 2   │
│ 2   ┆ 3   │
└─────┴─────┘

Now, as outlined in my comment above, you could use a generator expression to compute the columns as follows. All expressions are evaluated in parallel, but I am unsure whether the generator expression itself becomes a bottleneck for large row counts.

df.with_columns(
    (pl.col("a") * pl.col("b").get(idx)).alias(f"outer{idx}")
    for idx in range(len(df))
)

Output.

shape: (3, 5)
┌─────┬─────┬────────┬────────┬────────┐
│ a   ┆ b   ┆ outer0 ┆ outer1 ┆ outer2 │
│ --- ┆ --- ┆ ---    ┆ ---    ┆ ---    │
│ i64 ┆ i64 ┆ i64    ┆ i64    ┆ i64    │
╞═════╪═════╪════════╪════════╪════════╡
│ 0   ┆ 1   ┆ 0      ┆ 0      ┆ 0      │
│ 1   ┆ 2   ┆ 1      ┆ 2      ┆ 3      │
│ 2   ┆ 3   ┆ 2      ┆ 4      ┆ 6      │
└─────┴─────┴────────┴────────┴────────┘

This assumes both vectors come from the same dataframe and, hence, are of the same length.
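If the two vectors do not live in the same dataframe, the same idea can be applied to standalone Series. The following is a minimal sketch (the Series names and lengths are made up for illustration); the number of generated columns follows the length of b:

import polars as pl

a = pl.Series("a", [0, 1, 2])
b = pl.Series("b", [1, 2, 3, 4])  # may differ in length from a

a.to_frame().with_columns(
    (pl.col("a") * b[idx]).alias(f"outer{idx}")
    for idx in range(len(b))
)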

Answer by Cameron Riddell

As was suggested in a comment, here is an implementation using a cross join. The approach works with polars.LazyFrame (and should therefore work with polars.DataFrame as well). I've also included @Hericks' solution for verification and for the timings below.

import numpy as np
import polars as pl

def crossjoin_mul(df, cola, colb):
    # the cross join yields n*n rows, so the square root of the row count
    # recovers the original vector length n
    nrows = pl.count().sqrt().cast(pl.Int32)
    return (
        df.select(cola).join(df.select(colb), how='cross')
        .select(computed=cola * colb)
        # chunk the n*n products into n groups of n to rebuild the square shape
        .group_by(pl.arange(0, pl.count()) // nrows, maintain_order=True)
        .agg('computed')
        # turn each length-n list into a struct and unnest it into n columns
        .select(pl.col('computed').list.to_struct())
        .unnest('computed')
    )

def row_expr_mul(df, cola, colb):
    '''
    This will not work on a polars.LazyFrame since the number of expressions
    generated is dependent on the number of rows in the DataFrame.
    '''
    return df.select(
        (pl.col(cola) * pl.col(colb).get(idx)).alias(str(idx))
        for idx in range(len(df))
    )

df = pl.DataFrame({
    "a": [0, 1, 2],
    "b": [0, 1, 2],
})
ldf = df.lazy()

print(
    (np_res := np.outer(df['a'], df['b'])),

    (result := ldf.pipe(crossjoin_mul, pl.col('a'), pl.col('b')).collect()),
    f'{(result.to_numpy() == np_res).all() = }\n',

    (result := df.pipe(row_expr_mul, 'a', 'b')),
    f'{(result.to_numpy() == np_res).all() = }\n',

    sep='\n',
)
# [[0 0 0]
#  [0 1 2]
#  [0 2 4]]
#
# shape: (3, 3)
# ┌─────────┬─────────┬─────────┐
# │ field_0 ┆ field_1 ┆ field_2 │
# │ ---     ┆ ---     ┆ ---     │
# │ i64     ┆ i64     ┆ i64     │
# ╞═════════╪═════════╪═════════╡
# │ 0       ┆ 0       ┆ 0       │
# │ 0       ┆ 1       ┆ 2       │
# │ 0       ┆ 2       ┆ 4       │
# └─────────┴─────────┴─────────┘
# (result.to_numpy() == np_res).all() = True

# shape: (3, 3)
# ┌─────┬─────┬─────┐
# │ 0   ┆ 1   ┆ 2   │
# │ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╡
# │ 0   ┆ 0   ┆ 0   │
# │ 0   ┆ 1   ┆ 2   │
# │ 0   ┆ 2   ┆ 4   │
# └─────┴─────┴─────┘
# (result.to_numpy() == np_res).all() = True

Timings

The crossjoin/reshape approach performs quite poorly, especially as N (the number of rows) increases. However, it is the only approach that supports a LazyFrame. If your data fit into memory, it would likely be best to materialize them and take one of the other approaches, as sketched below.
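A minimal sketch of that suggestion, reusing the row_expr_mul helper and the ldf LazyFrame from above (assuming the collected frame fits in memory):

# materialize the LazyFrame, then fall back to the row-expression approach
materialized = ldf.collect()
result = materialized.pipe(row_expr_mul, 'a', 'b')

The benchmark script used for the timings below: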

from contextlib import contextmanager
from time import perf_counter

import numpy as np
from numpy.random import default_rng
import polars as pl

def polars_crossmul(df, cola, colb):
    nrows = pl.count().sqrt().cast(pl.Int32)

    return (
        df.select(cola).join(df.select(colb), how='cross')
        .select(computed=cola * colb)
        .group_by(pl.arange(0, pl.count()) // nrows, maintain_order=True)
        .agg('computed')
        .select(pl.col('computed').list.to_struct())
        .unnest('computed')
    )

def polars_rowmul(df, cola, colb):
    return df.select(
        (pl.col(cola) * pl.col(colb).get(idx)).alias(str(idx))
        for idx in range(len(df))
    )

@contextmanager
def timer(msg):
    start = perf_counter()
    yield
    stop = perf_counter()
    print(f'{msg: <40} {stop - start:.6f}s')

print(f'{pl.__version__ = }\n{np.__version__ = }', end='\n\n')

for nrows in [100, 1_000, 10_000, 20_000]:
    rng = default_rng(0)

    df = pl.DataFrame(rng.uniform(0, 100, size=(nrows, 2)), schema=['a', 'b'])
    ldf = df.lazy()

    print(f'{nrows = }')
    with timer('polars crossmul'):
        # polars with a crossjoin, multiply, and reshapes to mimic the square
        #  output of an outer product.
        result = polars_crossmul(ldf, pl.col('a'), pl.col('b')).collect()

    with timer('polars → numpy → polars'):
        # use numpy and convert back to a polars DataFrame afterwards
        t = df.to_numpy()
        result = pl.DataFrame(
            np.outer(t[:, 0], t[:, 1]),
            schema={str(i): pl.Float64 for i in range(t.shape[0])}
        )

    with timer('polars rowmul'):
        # note that this does not work with LazyFrame
        result = polars_rowmul(df, 'a', 'b')


    with timer('polars → numpy'):
        # just use numpy no back conversion
        t = df.to_numpy()
        result = np.outer(t[:, 0], t[:, 1])

    print()

Outputs

pl.__version__ = '0.20.3'
np.__version__ = '1.22.4'

nrows = 100
polars crossmul                          0.007882s
polars → numpy → polars                  0.001929s
polars rowmul                            0.001727s
polars → numpy                           0.000215s

nrows = 1000
polars crossmul                          0.026444s
polars → numpy → polars                  0.017812s
polars rowmul                            0.024762s
polars → numpy                           0.003128s

nrows = 10000
polars crossmul                          1.894327s
polars → numpy → polars                  0.782990s
polars rowmul                            0.355083s
polars → numpy                           0.167935s

nrows = 20000
polars crossmul                          7.641779s
polars → numpy → polars                  3.510021s
polars rowmul                            0.735001s
polars → numpy                           0.561703s