Polars DataFrame
The polars package provides a fast and powerful implementation of data frames.
Details
Start a lazy query by creating a LazyFrame with data.lazy().
Operations on a LazyFrame are not executed until execution is requested by calling either collect() or fetch().
Lazy operations are advised because they allow for query optimization and more parallelization.
Code
Create
Create a new DataFrame
| Action |
Code |
Details |
|
From column lists
|
pl.DataFrame({'A': [1, 2], 'fruits': ['banana', 'apple']})
|
|
|
From list of lists
|
data = [[1, 2, 3], [4, 5, 6]]
pl.DataFrame(data, schema=['a', 'b', 'c'])
|
|
|
From dict
|
|
|
|
From dict with schema
|
pl.DataFrame(
dict,
schema = {'col1': pl.Float32, 'col2': pl.Int64, 'col3': pl.Date}
)
|
|
|
Repeat each row with an index between a row-specific range, defined by min (inclusive) and max (exclusive) columns
|
data.select(
pl.col('patient'),
pl.int_ranges(start='startDay', end='endDay').alias('day')
).explode('day')
|
|
From pandas.DataFrame
|
|
|
|
From numpy array
|
data = np.array([[1, 2], [3, 4]])
pl.DataFrame(data, schema = ['a', 'b'], orient = 'col')
|
|
| Action |
Code |
Details |
|
From CSV file
|
|
|
Test
| Action |
Code |
Details |
Is pl.DataFrame or subclass
|
isinstance(x, pl.DataFrame)
|
|
Is pl.DataFrame but not subclass
|
|
|
|
Empty (no rows)
|
|
|
|
Not empty (has rows)
|
|
|
|
Data frames are equal
|
|
|
|
Any column contains null
|
any(col.has_nulls() for col in data)
|
|
|
Any column contains null
|
data.select(pl.all().is_null().any()).to_series().any()
|
|
|
Any column is filled with nulls
|
any(col.is_null().all() for col in data)
|
|
|
Duplicates across two or more columns
|
data.select('col1', 'col2').is_duplicated().any()
|
|
Tests for specific column(s)
| Action |
Code |
Details |
|
Contains column
|
|
|
|
Contains columns
|
{'age', 'sex'}.issubset(data.columns)
|
|
|
Contains columns cols
|
set(cols).issubset(data.columns)
|
|
|
Column is missing
|
'age' not in data.columns
|
|
|
Column has missing value
|
data['sex'].is_null().any()
|
|
|
Column has no missing values
|
data['sex'].is_not_null().all()
|
|
|
Columns are equal
|
|
|
|
Columns are equal series
|
data['sex'].equals(data['sex2'].alias('sex'))
|
Series names must match! |
|
Column has no duplicate values
|
data['date'].is_unique().all()
|
|
|
Column has duplicate values
|
data['date'].is_duplicated().any()
|
|
|
Column has duplicates within groups
|
data.select(
pl.col('date').is_duplicated().any().over('patient').any()
).item()
|
Checks whether any group (patient) has duplicate values (dates) |
|
Column has duplicates within groups
|
data.select(
pl.col('date').is_duplicated().any().over('patient')
).to_series().any()
|
|
|
Column has duplicates within groups
|
data.group_by('patient').agg(
dupe=pl.col('date').is_duplicated().any()
)['dupe'].any()
|
|
|
Column is of type
|
data.schema['col1'] == dtype
|
|
|
Column is bool type
|
data.schema['alive'] == pl.Bool
|
|
|
Column is string type
|
data.schema['sex'] == pl.String
|
|
|
Columns is numeric
|
data.schema['age'].is_numeric()
|
|
|
Column is integer type
|
data.schema['age'].is_integer()
|
|
|
Column is standard integer (64-bit)
|
data.schema['age'] == pl.Int64
|
|
|
Column is float
|
data.schema['age'].is_float()
|
|
|
Column is date
|
data.schema['date'] == pl.Date
|
|
|
Column is list type
|
data.schema['keywords'] == pl.List
|
|
|
Column is of type list of strings
|
data.schema['keywords'] == pl.List(pl.String)
|
|
|
Column is consistently rowwise greater than another column
|
(data['col1'] > data['col2']).all()
|
|
|
Column is sometimes rowwise greater than another column
|
(data['col1'] > data['col2']).any()
|
|
| Action |
Code |
Details |
|
Number of rows
|
|
|
|
Number of columns
|
|
|
|
Get column name as series
|
|
|
|
Column names
|
|
|
|
Column types
|
|
|
|
Column-types mapping
|
|
|
|
Find column index by name
|
data.find_idx_by_name('age')
|
|
|
Get column name as list
|
|
|
|
Get first column as series
|
|
|
|
Get first column as series
|
data.select( pl.first() )
|
|
|
Get the j-th column as series
|
|
|
|
Get last column as series
|
|
|
|
Number of missing values per column
|
|
|
|
Number of unique values in a column
|
|
|
|
Number of rows without any missing values across columns a, b and c, by group
|
data.group_by('subject').agg([
pl.all_horizontal(
pl.col(['a', 'b', 'c']).is_not_null()
).sum()
])
|
|
|
Number of unique rows across columns
|
|
|
|
Get the i-th row as tuple
|
|
|
|
Get rows as list of tuple
|
|
? |
|
First item (cell)
|
|
|
|
Item (cell) from row i and column index j
|
|
|
|
Item (cell) from row i and column name name
|
|
|
Update
Update the data frame in-place
| Action |
Code |
Details |
|
Replace column with another series
|
data.replace('age', newAgeSeries)
|
|
|
Remove or pop column
|
data.drop_in_place('Age')
|
Returns the dropped column |
|
Append column from series
|
data.hstack(s, in_place = True)
|
|
|
Insert column from series
|
|
|
Combine
| Action |
Code |
Details |
|
Add rows of a data frame (same columns)
|
|
|
|
Add rows of several data frames (same columns)
|
data.vstack(data2)
data.vstack(dataN)
data.rechunk()
|
|
Derive
Transform the data frame, preserving shape
| Action |
Code |
Details |
|
Rename column old to new
|
data.rename({'old': 'new'})
|
|
|
Rename columns
|
data.rename({'old1': 'new1', 'old2': 'new2'})
|
|
|
Cast column type
|
data.with_columns(pl.col('col1').cast(pl.Float32))
|
|
|
Cast columns to types
|
data.cast({'col1': pl.Float32, 'col2': pl.UInt8})
|
|
|
Update column values
|
data.with_columns( pl.col('age') + 5 )
|
|
|
Update column values on condition
|
df.with_columns(
pl.when( pl.col('age') >= 18 ).
then( pl.lit(1) ).
otherwise( pl.lit(-1) )
)
|
|
|
Update column values on conditions
|
df.with_columns(
pl.when( pl.col('age') >= 18 ).
then( pl.lit(1) ).
when( pl.col('sex') == 'male' ).
then(4).
otherwise( pl.lit(-1) )
)
|
|
|
Update column values for specific rows
|
rows = [1, 3, 5]
data.with_row_count().with_columns(
pl.when( pl.col('row_nr').is_in(rows) ).
then( pl.lit(True) )
)
|
|
Fill
| Action |
Code |
Details |
|
Fill nulls with zero
|
data.fill_null(strategy = 'zero')
|
|
|
Fill nulls with value
|
|
|
|
Fill nulls with LOCF
|
data.fill_null(strategy='forward')
|
Wrong for grouped data |
|
Fill NaNs with value v
|
|
|
|
Fill infinity with value v for all columns of dtype float
|
import polars.selectors as cs
data.with_columns(
cs.float().replace(float('inf'), v)
)
|
|
Row mask
All snippets output a pl.Series object. Use .to_list() to obtain list output.
| Action |
Code |
Details |
|
Mask indicating whether rows contain null in any column
|
data.select(
pl.any_horizontal(pl.all().is_null())
).to_series()
|
|
|
Mask indicating whether rows contain null in all columns
|
data.select(
pl.all_horizontal(pl.all().is_null())
).to_series()
|
|
|
Mask indicating duplicate rows
|
|
|
|
Mask indicating unique rows
|
|
|
|
Mask based on a column value
|
|
|
|
Mask based on pairwise comparison of another column
|
data.select(
pl.col('income') * 3 > pl.col('rent')
).to_series()
|
|
Over consecutive rows
Aggregate over consecutive rows, e.g., for computing a moving average.
| Action |
Code |
Details |
|
Compute centered rolling mean with window size w
|
data.with_columns(
smoothValue=pl.col('value').
rolling_mean(window_size=w, min_periods=0, center=True)
)
|
|
|
Compute centered rolling mean per group with window size w
|
data.with_columns(
smoothValue=pl.col('value').
rolling_mean(window_size=w, min_periods=0, center=True).
over('patient')
)
|
|
|
Detrend a column by group, using a centered rolling mean with window size w
|
data.with_columns(
trend=pl.col('value').
rolling_mean(window_size=w, min_periods=0, center=True).
over('patient')
).with_columns(
detrendedValue=(pl.col('value') - pl.col('trend')).over('patient')
)
|
|
|
Detrend a column by group but preserve the mean level, using a centered rolling mean with window size w
|
data.with_columns(
trend=pl.col('value').rolling_mean(
window_size=w, min_periods=0, center=True).over('patient')
).with_columns(
detrendedValue=(
pl.col('value') - pl.col('trend') + pl.col('trend').mean()
).over('patient')
)
|
|
Over time
Aggregation is done with respect to a date/time column (e.g., hourly, daily, weekly)
Row-based
Rolling computation for each row (i.e., observation in time)
| Action |
Code |
Details |
|
Rolling computation for a single column, with right-aligned partial window of max n days and at least i rows
|
data.with_columns(
pl.col('Hospitalized').rolling_mean(
by='Date',
closed='both',
window_size=f'{n - 1}d',
min_periods=i
)
)
|
Currently only available for rolling_mean(), not any of the other rolling_ functions. |
|
Rolling computation with right-aligned partial window of max n days
|
new_data = data.rolling(
index_column='Date',
period=f'{n}d'
).agg(
pl.mean('Obs')
)
|
|
|
Rolling computation with right-aligned partial windows of max n days, keep other columns
|
new_data = data.rolling(
index_column='Date',
period=f'{n}d'
).agg(
pl.exclude(['Date', 'Obs']).last(),
pl.mean('Obs')
)
|
|
Interval-based
Rolling computation for constant (not dynamic!) intervals. This may introduce additional moments in time.
| Action |
Code |
Details |
|
Daily rolling computation with right-aligned partial window of max n days
|
new_data = data.group_by_dynamic(
index_column='Date',
every='1d',
offset=f'-{n - 1}d',
period=f'{n - 1}d',
label='right',
closed='both',
start_by='window'
).agg(
pl.mean('Obs')
)
|
To be verified |
|
Dynamic daily rolling statistic proportional to number of window observations, with right-aligned partial window of max n days
|
new_data = data.group_by_dynamic(
index_column='Date',
every='1d',
offset=f'-{n - 1}d',
period=f'{n - 1}d',
label='right',
closed='both',
start_by='window'
).agg(
pl.sum('Hospitalized').alias('DaysHospitalized'),
pl.count()
).with_columns(
Proportion=pl.col('DaysHospitalized') / pl.col('count')
)
|
To be verified |
|
Daily rolling computation with left-aligned partial window of max n days
|
new_data = data.group_by_dynamic(
index_column='Date',
every='1d',
offset=f'-{n - 1}d',
period=f'{n - 1}d',
label='left',
closed='both',
start_by='datapoint'
).agg(
pl.mean('Obs')
)
|
To be verified |
Order
| Action |
Code |
Details |
|
Reorder all columns in the given order
|
data.select(pl.col(['patient', 'sex', 'age']))
|
|
|
Reorder specific columns in a given order
|
cols = ['topic', 'parent', 'language']
data.select(pl.col(cols), pl.exclude(cols))
|
There doesn't seem to be a shorter way to do this |
|
Sort rows by column value
|
|
|
Grow
Transformations which increase the shape of the data frame
| Action |
Code |
Details |
|
Append constant numeric column
|
data.with_columns(Intercept=pl.lit(1))
|
|
|
Append series as new column
|
s = pl.Series("apple", [10, 20, 30])
data.hstack([s])
|
Note the brackets |
Add derived column(s)
| Action |
Code |
Details |
|
Transform another column
|
data.with_columns( ageSq = pl.col('age') ** 2 )
|
|
|
Transform another column
|
data.with_columns(
(pl.col('age') ** 2).alias('ageSq')
)
|
|
|
Constant values conditional on another column
|
data.with_columns(
AdultScore=pl.when(pl.col('age') >= 18).
then(pl.lit(1)).
otherwise(pl.lit(-1))
)
|
|
|
Add multiple columns based on transformations of other columns
|
data.with_columns(
ageSq=pl.col('age') ** 2,
hospitalizationsSq=pl.col('hospitalizations') ** 2
)
|
|
|
Add multiple columns based on transformations of other columns
|
data.with_columns(
(pl.col('Age') ** 2).alias('Age2'),
pl.col('Age').alias('Age3') ** 3
)
|
alias() can be called after pl.col(), may be more readable sometimes |
|
Add multiple constant columns based on a dict
|
for k, v in dict.items():
data = data.with_columns(pl.lit(v).alias(k))
|
|
Integer column derivations
| Action |
Code |
Details |
|
Map integer values to strings, as categorical column
|
map = {1: 'first', 2: 'second', 3: 'third'}
data.with_columns(
rank=pl.col('place').replace_strict(map).cast(pl.Categorical)
)
|
|
|
Timestamp (in seconds) since Unix epoch (1970-01-01), as datetime column
|
data.with_columns(
date=pl.from_epoch('timestamp')
)
|
|
|
Milliseconds timestamp since Unix epoch, as datetime column
|
data.with_columns(
date=pl.from_epoch('timestamp_ms', time_unit='ms')
)
|
|
|
Total days since Unix epoch, as date column
|
data.with_columns(
date=pl.from_epoch('daystamp', time_unit='d')
)
|
|
|
Total days since Unix epoch, as date column
|
data.with_columns(
date=pl.col('daystamp').cast(pl.Date)
)
|
|
|
Number of days since reference date, as date column
|
from datetime import date
ref_days = (x - date.fromtimestamp(0)).days
data.with_columns(
date=pl.col('day').cast(pl.Date) + pl.duration(days=ref_days)
)
|
|
|
Number of days since reference date, as date column
|
from datetime import date
ref_days = (x - date.fromtimestamp(0)).days
data.with_columns(
date=(pl.col('day') + ref_days).cast(pl.Date)
)
|
|
String column derivations
| Action |
Code |
Details |
|
Parse string column to date
|
data.with_columns(
date=pl.col('datestring').str.to_date()
)
|
|
|
Parse string column to date with known format
|
data.with_columns(
date = pl.col('datestring').str.to_date('%Y-%m-%d')
)
|
|
Datetime column derivations
| Action |
Code |
Details |
|
Weekday from date column
|
|
|
|
Month from date column
|
data.with_columns(
month=pl.col('date').dt.month()
)
|
|
|
Year from date column
|
data.with_columns(
year=pl.col('Date').dt.year()
)
|
|
|
Elapsed days between date ranges, by group
|
data.with_columns(
elapsedDays=(
pl.col('Date') - pl.col('Date').min().over('Subject')
).dt.total_days().add(1)
)
|
|
Add rows
| Action |
Code |
Details |
|
Add row as tuple
|
|
|
|
Add list of tuples
|
|
|
|
Add data frame
|
|
|
Shrink the data frame
Remove one or more columns
| Action |
Code |
Details |
|
Remove single column
|
|
|
|
Remove columns
|
data.drop(['Age', 'Sex'])
|
|
|
Remove columns
|
data.select(pl.exclude(['Age', 'Sex']))
|
|
|
Select single column
|
|
Verbose |
|
Select single column
|
data.select(pl.col('Age'))
|
|
|
Multiple columns
|
data.select('col1', 'col2')
|
|
|
Multiple columns, specified by list
|
data.select(['col1', 'col2'])
|
|
|
Multiple columns, specified by list
|
data.select(pl.col(['Age', 'Sex']))
|
|
|
Remove all numeric columns
|
|
|
Remove one or more rows
| Action |
Code |
Details |
|
Remove all rows (clear)
|
|
|
|
Limit query to first n rows
|
|
|
|
Limit query to last n rows
|
|
|
Keep rows
| Action |
Code |
Details |
|
Select the i-th row
|
|
|
|
Select the i-th last row
|
|
|
|
Select rows by list of row numbers
|
|
|
|
First n rows (head)
|
|
|
|
Last n rows (tail)
|
|
|
|
Slice of rows from a to b
|
|
|
|
Slice of rows from a to b
|
|
|
|
Keep rows conditionally on column
|
data.filter(pl.col('age') >= 18)
|
|
|
Keep rows based on multiple column conditions
|
data.filter(
(pl.col('age') >= 18) & (pl.col('sex') == 'male')
)
|
|
|
Sample at most n rows
|
|
|
|
Sample n rows, with replacement
|
data.sample(n, with_replacement=True)
|
Replicates some rows. The result is guaranteed to have n rows, even when the input contains fewer rows. |
|
Sample n rows per group, with replacement
|
data.group_by('Patient').map_groups(
lambda df: df.sample(n, with_replacement=True)
)
|
Very slow due to the map_groups() call. |
|
Sample at most n rows per group
|
data.filter(
pl.int_range(pl.len()).shuffle().over('patient') < n
)
|
Groups with fewer than n rows keep all rows. |
Exclude rows
| Action |
Code |
Details |
|
Remove rows by row numbers
|
data.with_row_index().filter(
~pl.col('index').is_in(rows)
)
|
Output contains additional column 'index' |
|
Remove all rows except the given row numbers
|
|
|
|
Remove all rows except the given row numbers
|
data.filter(pl.arange(0, pl.count()).is_in([1, 5, 7]))
|
|
|
Remove rows that contain null values
|
|
|
|
Remove rows that contain null values in certain columns
|
data.drop_nulls(['fruits', 'cars'])
|
|
Aggregate rows
By group
| Action |
Code |
Details |
|
Mean of column by group
|
data.group_by('sex').agg(pl.col('age').mean())
|
|
Reshape
| Action |
Code |
Details |
|
From wide to long format
|
data.melt(id_vars='sex', value_vars=['a', 'b'])
|
|
|
To narrow format
|
|
? |
Combine
| Action |
Code |
Details |
|
Concatenate rows of dataframes
|
pl.concat([df, df2, dfN])
|
|
|
Concatenate rows of dataframes, having partially overlapping columns
|
pl.concat([df, df2, dfN], how='diagonal')
|
|
|
Merge two data frames on the sorted key
|
|
|
|
Inner join
|
data.join(data2, on=['sex', 'country'])
|
|
|
Left join
|
data.join(data2, on=['sex', 'country'], how='left')
|
|
|
Right join
|
data.join(data2, on=['sex', 'country'], how='right')
|
|
|
Outer (full) join
|
data.join(data2, on=['sex', 'country'], how='full')
|
|
|
Cross join
|
data.join(data2, on=['sex', 'country'], how='cross')
|
|
|
Semi join (one match per index)
|
data.join(data2, on=['sex', 'country'], how='semi')
|
|
|
Left anti join (exclude matches from table 2)
|
data.join(data2, on=['sex', 'country'], how='anti')
|
|
Convert
| Action |
Code |
Details |
|
To series (first column only)
|
|
|
To polars.LazyFrame
|
|
|
To pandas.DataFrame
|
|
|
|
To list of series
|
|
|
|
Split into list of data frames based on column
|
|
|
|
Split into list of data frames based on column tuples
|
data.partition_by('sex', 'country')
|
|
|
Split into dict of data frames based on column(s)
|
data.partition_by('sex', 'country', as_dict = True)
|
|
|
To CSV file
|
data.write_csv('derp.csv')
|
|
|
To Parquet file
|
data.write_parquet('derp.parquet')
|
|
|
To Arrow IPC or Feather
|
data.write_ipc('file.arrow')
|
|
|
To JSON
|
|
|
|
Write to clipboard, in TSV format
|
|
|
Show