import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(df):
    """
    Compute the score between each pair of columns in df.

    Returns a DataFrame indexed by the 'year' level of df's index, with one
    column per pair of df's columns; each value is
    ((agreements - disagreements) / count + 1) / 2, where an agreement is both
    grades equal to 1 or both equal to 3, and a disagreement is one grade 1 and
    the other 3.
    """
    agreements = {}
    disagreements = {}
    for col in IT.combinations(df, 2):
        fruit1 = df[col[0]].values
        fruit2 = df[col[1]].values
        agreements[col] = (((fruit1 == 1) & (fruit2 == 1)) |
                           ((fruit1 == 3) & (fruit2 == 3)))
        disagreements[col] = (((fruit1 == 1) & (fruit2 == 3)) |
                              ((fruit1 == 3) & (fruit2 == 1)))
    agreements = pd.DataFrame(agreements, index=df.index)
    disagreements = pd.DataFrame(disagreements, index=df.index)
    numerator = agreements.astype(int) - disagreements.astype(int)
    # Builtin groupby aggregators (sum/count) do the per-year reduction in one pass.
    grouped = numerator.groupby(level='year')
    total = grouped.sum()
    count = grouped.count()
    score = ((total / count) + 1) / 2
    return score


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")
print(similarity_score(df2))

# and then concatenating the partial results, you could define func a bit
# differently and use df.groupby('year').apply(func). This will call func for
# each year-group, and concatenate the results for you. If you arrange for func
# to return Series, then the index of the Series will become the columns of the
# DataFrame returned by df.groupby('year').apply(func). (Uncommenting the
# comments below may help make this a bit clearer.)
We can also get better performance by doing as much work as possible on the whole DataFrame, df, before calling df.groupby(...). Also, if you can express the computation in terms of builtin aggregators such as sum and count, the computation can be done more quickly than if you use custom functions with groupby/apply.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# Output:
#         Orange              Grape
#          Grape     Apple    Apple
# year
# 1946  0.641900  0.650426  0.629111
# 1947  0.633039  0.639344  0.644388
# 1948  0.616727  0.630566  0.613117
# 1949  0.637786  0.635379  0.634176

# and then concatenating the partial results, you could define func a bit
# differently and use df.groupby('year').apply(func). This will call func for
# each year-group, and concatenate the results for you. If you arrange for func
# to return Series, then the index of the Series will become the columns of the
# DataFrame returned by df.groupby('year').apply(func). (Uncommenting the
# comments below may help make this a bit clearer.)
We can also improve performance a bit, by doing as much work as we can on the original big DataFrame, df, then calling df.groupby('year').apply(func), where we do as little work as we can inside func. This (hopefully) minimizes the number of function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# Output:
#         Orange              Grape
#          Grape     Apple    Apple
# year
# 1946  0.641900  0.650426  0.629111
# 1947  0.633039  0.639344  0.644388
# 1948  0.616727  0.630566  0.613117
# 1949  0.637786  0.635379  0.634176

# and then concatenating the partial results, you can get better performance by
# doing as much work as possible on the whole DataFrame, df, before calling
# df.groupby(...). Also, if you can express the computation in terms of builtin
# aggregators such as sum and count, the computation can be done more quickly
# than if you use custom functions with groupby/apply.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")

# Build boolean agreement/disagreement columns for every pair of fruits on the
# whole DataFrame at once, then reduce per year with builtin sum/count.
agreements = {}
disagreements = {}
for col in IT.combinations(df2, 2):
    fruit1 = df2[col[0]].values
    fruit2 = df2[col[1]].values
    agreements[col] = (((fruit1 == 1) & (fruit2 == 1)) |
                       ((fruit1 == 3) & (fruit2 == 3)))
    disagreements[col] = (((fruit1 == 1) & (fruit2 == 3)) |
                          ((fruit1 == 3) & (fruit2 == 1)))

agreements = pd.DataFrame(agreements, index=df2.index)
disagreements = pd.DataFrame(disagreements, index=df2.index)
numerator = agreements.astype(int) - disagreements.astype(int)
grouped = numerator.groupby(level='year')
total = grouped.sum()
count = grouped.count()
similarity = ((total / count) + 1) / 2
print(similarity)

# Output:
#          Grape    Orange     Apple
#          Apple     Apple     Grape
# year
# 1946  0.629111  0.650426  0.641900
# 1947  0.644388  0.639344  0.633039
# 1948  0.613117  0.630566  0.616727
# 1949  0.634176  0.635379  0.637786

# I would rearrange the computation a bit differently. Instead of looping over
# the years:
for year in range(1946, 2015): partial_result = func(df, year) and then concatenating the partial results, you could define func a bit differently and use df.groupby('year').apply(func). This will call func for each year-group, and concatenate the results for you. If you arrange for func to return Series, then the index of the Series will become the columns of the DataFrame returned by df.groupby('year').apply(func). (Uncommenting the comments below may help make this a bit clearer.)
We can also improve performance a bit, by doing as much work as we can on the original big DataFrame, df, then calling df.groupby('year').apply(func), where we do as little work as we can inside func. This (hopefully) minimizes the number of function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# I would rearrange the computation a bit differently. We can improve
# performance a bit, by doing as much work as we can on the original big
# DataFrame, df, then calling df.groupby('year').apply(func), where we do as
# little work as we can inside func. This (hopefully) minimizes the number of
# function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)

# I would rearrange the computation a bit differently. Instead of looping over
# the years:
for year in range(1946, 2015): partial_result = func(df, year) and then concatenating the partial results, you could define func a bit differently and use df.groupby('year').apply(func). This will call func for each year-group, and concatenate the results for you. If you arrange for func to return Series, then the index of the Series will become the columns of the DataFrame returned by df.groupby('year').apply(func). (Uncommenting the comments below may help make this a bit clearer.)
We can also improve performance a bit, by doing as much work as we can on the original big DataFrame, df, then calling df.groupby('year').apply(func), where we do as little work as we can inside func. This (hopefully) minimizes the number of function calls needed to produce the desired result.
import itertools as IT
import numpy as np
import pandas as pd

np.random.seed(2017)


def make_df():
    """Build a random fruit/grade/year DataFrame with a synthetic manufacturer id."""
    N = 10000
    df = pd.DataFrame({'fruit': np.random.choice(['Apple', 'Orange', 'Grape'], size=N),
                       'grade': np.random.choice([1, 2, 3], p=[0.7, 0.1, 0.2], size=N),
                       'year': np.random.choice(range(1946, 1950), size=N)})
    # Disambiguate repeated (year, fruit) rows so the unstack() below gets a unique index.
    df['manufacturer'] = (df['year'].astype(str) + '-' +
                          df.groupby(['year', 'fruit'])['fruit'].cumcount().astype(str))
    df = df.sort_values(by=['year'])
    return df


def similarity_score(fruit1, fruit2):
    """
    Score two equal-length grade arrays.

    An agreement is both grades 1 or both 3; a disagreement is one grade 1 and
    the other 3. Returns ((agreements - disagreements) / len + 1) / 2, a float
    in [0, 1].
    """
    agreements = np.sum(((fruit1 == 1) & (fruit2 == 1)) |
                        ((fruit1 == 3) & (fruit2 == 3)))
    disagreements = np.sum(((fruit1 == 1) & (fruit2 == 3)) |
                           ((fruit1 == 3) & (fruit2 == 1)))
    return (((agreements - disagreements) / float(len(fruit1))) + 1) / 2


df = make_df()
df2 = df.set_index(['year', 'fruit', 'manufacturer'])['grade'].unstack(['fruit'])
df2 = df2.dropna(axis=0, how="any")


def func(df_year):
    """Return a Series of pairwise similarity scores for one year-group."""
    # print(df_year)
    network_df = []
    index = []
    for pair in IT.combinations(df_year, 2):
        fruit1 = df_year[pair[0]].values
        fruit2 = df_year[pair[1]].values
        network_df.append(similarity_score(fruit1, fruit2))
        index.append(pair)
    index = pd.MultiIndex.from_tuples(index)
    network_df = pd.Series(network_df, index=index)
    # print(network_df)
    return network_df


result = df2.groupby(level='year').apply(func)
print(result)