Is there any way to access other columns inside an ibis aggregate statement when using `s.across`? I created a bunch of boolean columns using mutate across, and now I want to sum another column (`df.net_spend_amt`) over the rows where each boolean column (selected with `s.matches("is_pre|is_post")`) is True.
# Sketch of the desired query: inside ``s.across``, aggregate a *different*
# column for each selected boolean column.
(
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
    .group_by("campaign_id", "modality")
    .agg(
        s.across(
            selector=s.matches("is_pre|is_post"),
            func=table.net_spend_amount.count(_),
        )
    )
)

# Section: Reproducible example
Create data, add to connection, mutate boolean columns
I create a dataframe and add it to an ibis connection (DuckDB here for simplicity, PySpark in my environment). Then I run some mutates to create my boolean columns.
import datetime
from random import randrange

import ibis
from ibis import _
from ibis import selectors as s
from pandas import DataFrame

ibis.options.interactive = True

# Synthetic transactions: one campaign, alternating modality, random
# transaction dates in a 50-day span starting 2024-06-08, random spend.
rows = 100
data = {
    'campaign_id': [102793] * rows,
    'modality': ['INSTORE', 'ONLINE'] * (rows // 2),
    'post_start_date': [datetime.date(2024, 6, 23)] * rows,
    'pre_end_date': [datetime.date(2024, 6, 22)] * rows,
    'trn_dt': [
        datetime.date(2024, 6, 8) + datetime.timedelta(days=randrange(50))
        for _i in range(rows)
    ],
    'net_spend_amt': [
        float(randrange(100)) - 35 + float(randrange(100)) / 100
        for _i in range(rows)
    ],
}
pdf = DataFrame(data)

con = ibis.duckdb.connect()
df = con.create_table("test", obj=pdf, overwrite=True)


def mutate_is_weeks_pre(pre_end_date_col, trn_date_col, weeks):
    """Return a ``pipe``-able step adding ``is_pre{week}w`` boolean columns.

    Args:
        pre_end_date_col: Name of the column holding the pre-period end date.
        trn_date_col: Name of the column holding the transaction date.
        weeks: Iterable of week counts; one boolean column is added per entry.

    Returns:
        A function taking a table and returning it with the extra columns.
    """
    def inner(df):
        def is_weeks_pre(pre_end_date, trn_date, week):
            # Window is [pre_end_date - week * 7 days, pre_end_date], both ends inclusive.
            days = week * 7
            pre_start_date = pre_end_date.sub(ibis.interval(days=days))
            return (pre_start_date <= trn_date) & (trn_date <= pre_end_date)

        return df.mutate(**{
            f'is_pre{week}w': is_weeks_pre(df[pre_end_date_col], df[trn_date_col], week)
            for week in weeks
        })
    return inner


def mutate_is_weeks_post(post_start_date_col, trn_date_col, weeks):
    """Return a ``pipe``-able step adding ``is_post{week}w`` boolean columns.

    Args:
        post_start_date_col: Name of the column holding the post-period start date.
        trn_date_col: Name of the column holding the transaction date.
        weeks: Iterable of week counts; one boolean column is added per entry.

    Returns:
        A function taking a table and returning it with the extra columns.
    """
    def inner(df):
        # The per-column helper takes a single ``week`` (matching
        # ``is_weeks_pre``) so it no longer shadows the outer ``weeks`` list.
        def is_weeks_post(post_start_date, trn_date, week):
            # Window is [post_start_date, post_start_date + week * 7 days], both ends inclusive.
            days = week * 7
            post_end_date = post_start_date.add(ibis.interval(days=days))
            return (post_start_date <= trn_date) & (trn_date <= post_end_date)

        return df.mutate(**{
            f'is_post{week}w': is_weeks_post(df[post_start_date_col], df[trn_date_col], week)
            for week in weeks
        })
    return inner


df = (
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
)

# Section: Run with working across (works)
This is pretty useless because it just counts booleans, but I wanted to make sure I had my `across` set up correctly.
# Sanity check of the ``s.across`` wiring: apply ``count()`` to every column
# whose name matches "is_pre|is_post", per campaign and modality.
(
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
    .group_by("campaign_id", "modality")
    .agg(
        s.across(
            selector=s.matches("is_pre|is_post"),
            func=_.count(),
        )
    )
)

# Section: Add sum for net spend (does not work)
I can't figure out a way to do this... I couldn't even get it to run when I used `df.net_spend_amt`.
# Attempt that does not work: sum ``net_spend_amt`` once per matched boolean
# column, passing the deferred ``_`` as the ``where`` filter.
(
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
    .group_by("campaign_id", "modality")
    .agg(
        s.across(
            selector=s.matches("is_pre|is_post"),
            func=df.net_spend_amt.sum(where=_),
        )
    )
)

# Section: Without across (works)
This solution works: I can loop, create a bunch of expressions, and pass them to `agg`. However, I'd like to separate the logic that builds the boolean columns from the logic that sums the spend, and simply use `across` to do the looping, if that's possible / practical...
# Working version without ``s.across``: the post-period condition is spelled
# out inline as the ``where`` filter of the sum.
(
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
    .group_by("campaign_id", "modality")
    .agg(
        **{
            'post+2w+sales': _.net_spend_amt.sum(
                where=(
                    (_.post_start_date <= _.trn_dt)
                    & (_.trn_dt <= (_.post_start_date + datetime.timedelta(days=14)))
                )
            )
        }
    )
)

