1

Is there any way to access other columns inside an ibis aggregate statement using across? I created a bunch of boolean columns using mutate across, but I want to sum another column (df.net_spend_amt) only over the rows where the boolean column (s.matches("is_pre|is_post")) is True.

# Sketch of the intent: for every boolean flag column selected by `across`,
# aggregate `net_spend_amt` over the rows of each (campaign_id, modality)
# group where that flag is True.
# NOTE(review): this only illustrates the desired shape; the question reports
# that passing a bound column expression as `func` in this way does not run.
(
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
    .group_by("campaign_id", "modality")
    .agg(
        s.across(
            selector=s.matches("is_pre|is_post"),
            func=df.net_spend_amt.count(_),
        )
    )
)

Reproducible example

Create data, add to connection, mutate boolean columns

Create a dataframe and add it to an ibis connection (duckdb here for simplicity, pyspark in my environment). Then I run some mutates to create my boolean columns.

import datetime
from random import randrange, seed

import ibis
from ibis import _
from ibis import selectors as s
from pandas import DataFrame

ibis.options.interactive = True

# Seed the generator so that every run of the example builds the same table.
seed(0)

rows = 100
data = {
    'campaign_id': [102793] * rows,
    'modality': ['INSTORE', 'ONLINE'] * (rows // 2),
    'post_start_date': [datetime.date(2024, 6, 23)] * rows,
    'pre_end_date': [datetime.date(2024, 6, 22)] * rows,
    # Transaction dates spread over the 50 days starting 2024-06-08.
    'trn_dt': [
        datetime.date(2024, 6, 8) + datetime.timedelta(days=randrange(50))
        for i in range(rows)
    ],
    # Whole part in [-35, 64] plus a fractional part in [0.00, 0.99].
    'net_spend_amt': [
        float(randrange(100)) - 35 + float(randrange(100)) / 100
        for i in range(rows)
    ],
}
pdf = DataFrame(data)

con = ibis.duckdb.connect()
df = con.create_table("test", obj=pdf, overwrite=True)


def mutate_is_weeks_pre(pre_end_date_col, trn_date_col, weeks):
    """Return a ``pipe`` step that adds one ``is_pre{N}w`` flag per week count.

    Each flag is True when the transaction date lies in the closed interval
    ``[pre_end_date - N * 7 days, pre_end_date]``.

    Args:
        pre_end_date_col: Name of the column holding the pre-period end date.
        trn_date_col: Name of the column holding the transaction date.
        weeks: Iterable of week counts; one boolean column is added per entry.

    Returns:
        A function that takes a table and returns it with the flag columns
        added via ``mutate``.
    """
    def inner(df):
        def is_weeks_pre(pre_end_date, trn_date, week):
            days = week * 7
            pre_start_date = pre_end_date.sub(ibis.interval(days=days))
            return (pre_start_date <= trn_date) & (trn_date <= pre_end_date)

        return df.mutate(**{
            f'is_pre{week}w': is_weeks_pre(df[pre_end_date_col], df[trn_date_col], week)
            for week in weeks
        })

    return inner


def mutate_is_weeks_post(post_start_date_col, trn_date_col, weeks):
    """Return a ``pipe`` step that adds one ``is_post{N}w`` flag per week count.

    Each flag is True when the transaction date lies in the closed interval
    ``[post_start_date, post_start_date + N * 7 days]``.

    Args:
        post_start_date_col: Name of the column holding the post-period start
            date.
        trn_date_col: Name of the column holding the transaction date.
        weeks: Iterable of week counts; one boolean column is added per entry.

    Returns:
        A function that takes a table and returns it with the flag columns
        added via ``mutate``.
    """
    def inner(df):
        # The scalar parameter is named `week` (not `weeks`) so it neither
        # shadows the outer list nor diverges from `is_weeks_pre`.
        def is_weeks_post(post_start_date, trn_date, week):
            days = week * 7
            post_end_date = post_start_date.add(ibis.interval(days=days))
            return (post_start_date <= trn_date) & (trn_date <= post_end_date)

        return df.mutate(**{
            f'is_post{week}w': is_weeks_post(df[post_start_date_col], df[trn_date_col], week)
            for week in weeks
        })

    return inner


df = (
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
)

enter image description here

Run with working across (works)

This is pretty useless because it just counts booleans, but I wanted to make sure I had my across set up correctly.

# Sanity check of the `across` wiring: apply `count()` to every boolean flag
# column within each (campaign_id, modality) group.
flag_columns = s.matches("is_pre|is_post")
with_flags = (
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
)
with_flags.group_by("campaign_id", "modality").agg(
    s.across(selector=flag_columns, func=_.count())
)

enter image description here

Add sum for net spend (does not work)

I can't figure out a way to do this... I couldn't even get it to run when I used df.net_spend_amt

# Non-working attempt (kept as posted): try to sum `net_spend_amt` once per
# boolean flag column by handing `across` a bound column expression whose
# `where=` filter is the deferred `_`.
# NOTE(review): presumably this fails because `_` here is not bound to each
# selected flag column the way a deferred/callable `func` would be — confirm
# against the ibis `across` documentation.
(
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
    .group_by("campaign_id", "modality")
    .agg(
        s.across(
            selector= s.matches("is_pre|is_post"),
            func=df.net_spend_amt.sum(where=_)
        )
    )
)

Without across (works)

This solution works: I can loop, create a bunch of expressions, and pass them in. However, I'd like to separate the logic for the boolean columns from the logic that sums the spend, and simply use across to loop, if that's possible / practical...

# Explicit version without `across`: sum `net_spend_amt` over the rows whose
# transaction date falls within 14 days after `post_start_date` (inclusive on
# both ends), per (campaign_id, modality) group.
two_weeks = datetime.timedelta(days=14)
in_post_2w = (_.post_start_date <= _.trn_dt) & (
    _.trn_dt <= (_.post_start_date + two_weeks)
)
(
    df
    .pipe(mutate_is_weeks_pre('pre_end_date', 'trn_dt', weeks=[2, 4, 52]))
    .pipe(mutate_is_weeks_post('post_start_date', 'trn_dt', weeks=[2, 4, 52]))
    .group_by("campaign_id", "modality")
    .agg(**{'post+2w+sales': _.net_spend_amt.sum(where=in_post_2w)})
)

enter image description here

1 Answer 1

0

I figured it out — I needed to use a lambda:

# For each boolean flag column matched by the selector, sum `net_spend_amt`
# per (campaign_id, modality) group, counting a row's spend only when the
# flag is True; every other row contributes 0.0 via the `else_` branch.
(
    df
    .group_by("campaign_id", "modality")
    .agg(
        s.across(
            s.matches("^is_pre|^is_post"),
            # A lambda receives each selected column in turn, so the flag can
            # be combined with a different column of the same table.
            lambda col: ibis.cases(
                (col, df.net_spend_amt),
                else_=ibis.literal(0.0),
            ).sum(),
            # Drop the leading "is_" so that e.g. "is_pre2w" becomes
            # "pre2w_net_spend_amt".
            names=lambda col_name, f: f"{col_name.removeprefix('is_')}_net_spend_amt",
        )
    )
)
Sign up to request clarification or add additional context in comments.

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.