Can someone help me?
I need two groups to perform linear regression
example:
pdf = pd.DataFrame({'group_id':[1,1,1,2,2,2,3,3,3,3], 'sex':['M','M','F','F','M','F','M','F','F','M'], 'x':[0,1,2,0,1,5,2,3,4,5], 'y':[2,1,0,0,0.5,2.5,3,4,5,6]}) df = sqlContext.createDataFrame(pdf) result_schema =StructType([ StructField('group_id',DoubleType()), StructField('sex',StringType()), StructField('x',DoubleType()) ]) @pandas_udf(result_schema, PandasUDFType.GROUPED_MAP) def ols(df): group_id = df['group_id'].iloc[0] sex = df['sex'].iloc[0] y = df['y'].astype(int) X = df['x'].astype(int) X = sm.add_constant(X) model = sm.OLS(y, X).fit() return pd.DataFrame([[group_id] + [sex] + [model.params[1]]], columns=['group_id'] + ['sex'] + ['x']) beta = df.groupby('group_id', 'sex').apply(ols) beta.show() return error:
PythonException: An exception was thrown from a UDF: 'IndexError: index out of bounds',