How to use natsort in polars

75 Views Asked by At

I can't get the code below to work. How should I solve it?

import polars as pl
from natsort import index_natsorted, natsorted
# Build a frame from two lists: the strings to be naturally sorted and some integers.
data = pl.DataFrame([["a1","a10","a2"], [4,5,6]],)
# Pass the result of index_natsorted() on the single-column selection to DataFrame.slice().
# NOTE(review): presumably this is the failing line -- DataFrame.slice is documented as
# taking an integer offset and length rather than a list of row indices; confirm.
data.slice(index_natsorted(data[["column_0"]]))
1

There is 1 best solution below

0
jqurious On

It looks like the basic form of https://pypi.org/project/natsort/ is equivalent to something along the lines of:

def natsort(df, expr):
    """Return ``df`` with its rows reordered by a natural sort of ``expr``.

    Each string produced by ``expr`` is split into alternating runs of
    digits and non-digits. Digit runs are turned into integers so that,
    for example, ``"a4"`` sorts before ``"a10"``.

    Args:
        df: The Polars DataFrame to reorder. It must not already contain
            columns named ``row_nr``, ``_str`` or ``_num``, because those
            names are used for temporary columns.
        expr: A Polars string expression (e.g. ``pl.col("column_0")``)
            yielding the sort key of each row.

    Returns:
        A DataFrame with the columns of ``df`` and the rows in sorted order.

    Note:
        A token position is filled with its text only when it is
        non-numeric in *every* row. Where a position holds digits in some
        rows and text in others, the text tokens stay null in the sort key.
    """
    # Tokenise into digit / non-digit runs, e.g. "a1b10" -> ["a", "1", "b", "10"].
    expr = expr.str.extract_all(r"\d+|\D+")

    df = (
        df.with_row_count()
        .with_columns(
            # One struct field per token position, as text ...
            _str=expr.list.to_struct("max_width"),
            # ... and as integers. strict=False makes non-digit tokens
            # become null instead of raising on Polars versions that
            # enforce strict casts inside lists.
            _num=expr.cast(pl.List(int), strict=False).list.to_struct("max_width"),
        )
    )

    # One column per token position, plus a row number to restore alignment.
    nums = df.select("_num").unnest("_num").with_row_count()
    # Positions that are never numeric: every integer value is null.
    nulls = (col.name for col in nums if col.is_null().all())

    # Text tokens for exactly those never-numeric positions.
    strs = df.select("_str").unnest("_str").select(nulls)

    # Overlay the text columns on the integer frame, sort by every token
    # position, and keep only the resulting row order.
    order = nums.update(strs).sort(pl.exclude("row_nr")).select("row_nr")

    # Re-attach the original columns in sorted order and drop the helpers.
    return order.join(df.drop("_str", "_num"), on="row_nr", how="left").drop("row_nr")

Example usage:

# Sample data: one string column, plus an "id" column recording the original row order.
df = pl.DataFrame(
    {"column_0": ["c3", "a1b10", "a10", "b2", "a4"]}
).with_row_count("id")

natsort(df, pl.col("column_0"))
shape: (5, 2)
┌─────┬──────────┐
│ id  ┆ column_0 │
│ --- ┆ ---      │
│ u32 ┆ str      │
╞═════╪══════════╡
│ 1   ┆ a1b10    │
│ 4   ┆ a4       │
│ 2   ┆ a10      │
│ 3   ┆ b2       │
│ 0   ┆ c3       │
└─────┴──────────┘

What's happening is you split up the sequences of digits/non-digits:

# Split each string into runs of digits / non-digits.
expr = pl.col("column_0").str.extract_all(r"\d+|\D+")

df.with_columns(
    # Token positions as text.
    _str=expr.list.to_struct("max_width"),
    # Token positions as integers; strict=False turns non-digit tokens into
    # null instead of raising on Polars versions that enforce strict casts
    # inside lists.
    _num=expr.cast(pl.List(int), strict=False).list.to_struct("max_width"),
)
shape: (5, 4)
┌─────┬──────────┬──────────────────────┬─────────────────────┐
│ id  ┆ column_0 ┆ _str                 ┆ _num                │
│ --- ┆ ---      ┆ ---                  ┆ ---                 │
│ u32 ┆ str      ┆ struct[4]            ┆ struct[4]           │
╞═════╪══════════╪══════════════════════╪═════════════════════╡
│ 0   ┆ c3       ┆ {"c","3",null,null}  ┆ {null,3,null,null}  │
│ 1   ┆ a1b10    ┆ {"a","1","b","10"}   ┆ {null,1,null,10}    │
│ 2   ┆ a10      ┆ {"a","10",null,null} ┆ {null,10,null,null} │
│ 3   ┆ b2       ┆ {"b","2",null,null}  ┆ {null,2,null,null}  │
│ 4   ┆ a4       ┆ {"a","4",null,null}  ┆ {null,4,null,null}  │
└─────┴──────────┴──────────────────────┴─────────────────────┘

And replace the "stringified" numbers with the integer values:

shape: (5, 1)
┌──────────────────────┐
│ col                  │
│ ---                  │
│ struct[4]            │
╞══════════════════════╡
│ {"c",3,null,null}    │
│ {"a",1,"b",10}       │
│ {"a",10,null,null}   │
│ {"b",2,null,null}    │
│ {"a",4,null,null}    │
└──────────────────────┘

Which you can then .sort() by.