import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(df):
    """
    Compute the score between each pair of columns in df.

    Returns a DataFrame indexed by the 'year' level of df's index, with one
    column per pair of df's columns; each value is
    ((agreements - disagreements) / count + 1) / 2, where an agreement is both
    grades equal to 1 or both equal to 3, and a disagreement is one grade 1 and
    the other 3.
    """
    agreements = {}
    disagreements = {}
    for col in IT.combinations(df, 2):
        fruit1 = df[col[0]].values
        fruit2 = df[col[1]].values
        agreements[col] = (((fruit1 == 1) & (fruit2 == 1)) |
                           ((fruit1 == 3) & (fruit2 == 3)))
        disagreements[col] = (((fruit1 == 1) & (fruit2 == 3)) |
                              ((fruit1 == 3) & (fruit2 == 1)))
    agreements = pd.DataFrame(agreements, index=df.index)
    disagreements = pd.DataFrame(disagreements, index=df.index)
    numerator = agreements.astype(int) - disagreements.astype(int)
    # Builtin groupby aggregators (sum/count) do the per-year reduction in one pass.
    grouped = numerator.groupby(level='year')
    total = grouped.sum()
    count = grouped.count()
    score = ((total / count) + 1) / 2
    return score


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")
print(similarity_score(df2))

# and then concatenating the partial results, you could define func a bit
# differently and use df.groupby('year').apply(func). This will call func for
# each year-group, and concatenate the results for you. If you arrange for func
# to return Series, then the index of the Series will become the columns of the
# DataFrame returned by df.groupby('year').apply(func). (Uncommenting the
# comments below may help make this a bit clearer.)
We can also get better performance by doing as much work as possible on the whole DataFrame, df, before calling df.groupby(...). Also, if you can express the computation in terms of builtin aggregators such as sum and count, the computation can be done more quickly than if you use custom functions with groupby/apply.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# Output:
#         Orange              Grape
#          Grape     Apple    Apple
# year
# 1946  0.641900  0.650426  0.629111
# 1947  0.633039  0.639344  0.644388
# 1948  0.616727  0.630566  0.613117
# 1949  0.637786  0.635379  0.634176

# and then concatenating the partial results, you could define func a bit
# differently and use df.groupby('year').apply(func). This will call func for
# each year-group, and concatenate the results for you. If you arrange for func
# to return Series, then the index of the Series will become the columns of the
# DataFrame returned by df.groupby('year').apply(func). (Uncommenting the
# comments below may help make this a bit clearer.)
We can also improve performance a bit, by doing as much work as we can on the original big DataFrame, df, then calling df.groupby('year').apply(func), where we do as little work as we can inside func. This (hopefully) minimizes the number of function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# Output:
#         Orange              Grape
#          Grape     Apple    Apple
# year
# 1946  0.641900  0.650426  0.629111
# 1947  0.633039  0.639344  0.644388
# 1948  0.616727  0.630566  0.613117
# 1949  0.637786  0.635379  0.634176

# and then concatenating the partial results, you can get better performance by
# doing as much work as possible on the whole DataFrame, df, before calling
# df.groupby(...). Also, if you can express the computation in terms of builtin
# aggregators such as sum and count, the computation can be done more quickly
# than if you use custom functions with groupby/apply.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")

# Build boolean agreement/disagreement columns for every pair of fruits on the
# whole DataFrame at once, then reduce per year with builtin sum/count.
agreements = {}
disagreements = {}
for col in IT.combinations(df2, 2):
    fruit1 = df2[col[0]].values
    fruit2 = df2[col[1]].values
    agreements[col] = (((fruit1 == 1) & (fruit2 == 1)) |
                       ((fruit1 == 3) & (fruit2 == 3)))
    disagreements[col] = (((fruit1 == 1) & (fruit2 == 3)) |
                          ((fruit1 == 3) & (fruit2 == 1)))

agreements = pd.DataFrame(agreements, index=df2.index)
disagreements = pd.DataFrame(disagreements, index=df2.index)
numerator = agreements.astype(int) - disagreements.astype(int)
grouped = numerator.groupby(level='year')
total = grouped.sum()
count = grouped.count()
similarity = ((total / count) + 1) / 2
print(similarity)

# Output:
#          Grape    Orange     Apple
#          Apple     Apple     Grape
# year
# 1946  0.629111  0.650426  0.641900
# 1947  0.644388  0.639344  0.633039
# 1948  0.613117  0.630566  0.616727
# 1949  0.634176  0.635379  0.637786

# I would rearrange the computation a bit differently. Instead of looping over
# the years:
for year in range(1946, 2015): partial_result = func(df, year) and then concatenating the partial results, you could define func a bit differently and use df.groupby('year').apply(func). This will call func for each year-group, and concatenate the results for you. If you arrange for func to return Series, then the index of the Series will become the columns of the DataFrame returned by df.groupby('year').apply(func). (Uncommenting the comments below may help make this a bit clearer.)
We can also improve performance a bit, by doing as much work as we can on the original big DataFrame, df, then calling df.groupby('year').apply(func), where we do as little work as we can inside func. This (hopefully) minimizes the number of function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# I would rearrange the computation a bit differently. We can improve
# performance a bit, by doing as much work as we can on the original big
# DataFrame, df, then calling df.groupby('year').apply(func), where we do as
# little work as we can inside func. This (hopefully) minimizes the number of
# function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# I would rearrange the computation a bit differently. Instead of looping over
# the years:
for year in range(1946, 2015): partial_result = func(df, year) and then concatenating the partial results, you could define func a bit differently and use df.groupby('year').apply(func). This will call func for each year-group, and concatenate the results for you. If you arrange for func to return Series, then the index of the Series will become the columns of the DataFrame returned by df.groupby('year').apply(func). (Uncommenting the comments below may help make this a bit clearer.)
We can also improve performance a bit, by doing as much work as we can on the original big DataFrame, df, then calling df.groupby('year').apply(func), where we do as little work as we can inside func. This (hopefully) minimizes the number of function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)