Return to Answer

added 3700 characters in body

edited Aug 19, 2023 at 15:49

71.2k
5
76
257

Here is an example demonstrating some of the logic that you wrote for concentrations and errors, with Pandas:

import numpy as np
import pandas as pd

MAX_VARS = 20
MW = 42_000


def load(filename: str = 'scaled_uncertanty_fits.txt') -> pd.DataFrame:
    """Read the jagged fit-results file into a rectangular frame.

    The first line of the file is skipped and replaced by a fixed set of
    column names: 'Sample', 'RedChi2', 'DoF' and 3*MAX_VARS numbered columns.
    Fields are split on either ',' or '±'. Rows whose second column (RedChi2)
    is missing are treated as dataset titles: their 'Sample' text is
    forward-filled into a new leading 'dataset' column and the title rows
    themselves are removed from the result.

    :param filename: path of the text file to parse
    :return: one row per non-title line, indexed by 'csv_index'
    """
    df = pd.read_csv(
        filename,
        sep=',|±',
        # Regex separators are only handled by the Python parser; asking for it
        # explicitly avoids the ParserWarning emitted by the implicit fallback.
        engine='python',
        skiprows=1,  # skip the original headers - they aren't wide enough
        names=[
            'Sample', 'RedChi2', 'DoF',
            *range(3*MAX_VARS),
        ],
    )
    df.index.name = 'csv_index'

    # Forward-fill the dataset name (e.g. WT_2017)
    is_dataset = df.iloc[:, 1].isna()
    df.insert(loc=0, column='dataset', value=df.loc[is_dataset, 'Sample'])
    # Assign the filled column back instead of calling ffill(inplace=True) on
    # the selection: the in-place form operates on a temporary under
    # Copy-on-Write and would leave 'dataset' unfilled.
    df['dataset'] = df['dataset'].ffill()
    return df[~is_dataset]


def normalize_vars(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize variable-solution-uncertainty triples

    For a row holding n variables, the code reads the first n variable columns
    as names, the next n as solutions and the next n as uncertainties. These
    are rearranged into a frame with 'dataset', 'Sample' appended to the index
    and one column per (Solution | Uncertainty, Variable) pair.

    :param df: frame as returned by load()
    :return: float-valued frame with columns keyed by varmeta and Variable
    """
    var_offset = 4  # 'dataset', 'Sample', 'RedChi2', 'DoF' precede the variable columns
    var_cols = df.iloc[:, var_offset:]
    meta_names = pd.Index(name='varmeta', data=['Variable', 'Solution', 'Uncertainty'])

    rectangular = pd.DataFrame(
        index=df.index,
        columns=pd.MultiIndex.from_product((
            meta_names,
            pd.RangeIndex(name='varno', start=0, stop=MAX_VARS),
        )),
    )

    # Each variable contributes three populated cells (name, solution, uncertainty)
    n_vars = var_cols.notna().sum(axis=1)//3

    for row_vars, group in var_cols.groupby(n_vars):
        # The k-th block of row_vars columns belongs to the k-th entry of meta_names
        for position, meta in enumerate(meta_names):
            source = group.iloc[:, position*row_vars: (position + 1)*row_vars]
            source.columns = pd.MultiIndex.from_product(
                ((meta,), range(row_vars)),
                names=('varmeta', 'varno'),
            )
            rectangular.loc[group.index, source.columns] = source

    long = rectangular.stack(level='varno')
    # Unused varno slots are entirely NaN. Drop them explicitly rather than
    # relying on stack() to do so, so that the unstack below never sees
    # repeated NaN 'Variable' keys.
    long = long.dropna(how='all')

    normalized = (
        pd.merge(left=df[['dataset', 'Sample']], right=long[meta_names], on='csv_index')
        .set_index(['dataset', 'Sample', 'Variable'], append=True)
        .astype({'Solution': float, 'Uncertainty': float})
        .unstack('Variable')
    )
    return normalized


def extract_factors(df: pd.DataFrame) -> pd.Series:
    """Pull the number embedded at the end of the 'Solution' variable names.

    Underscores in each name are replaced by '.', then a trailing
    ``digits.digits`` group is extracted. Names without such a suffix are
    dropped.

    :param df: frame as returned by normalize_vars()
    :return: float Series named 'factor', indexed by the original variable name
    """
    names = df.columns[df.columns.get_loc('Solution')].droplevel(0)
    values = (
        names.get_level_values('Variable')
        .to_series(name='factor', index=names)
        .str.replace('_', '.', regex=False)  # literal replacement, stated explicitly
        .str.extract(r'(\d+\.\d+)$', expand=False)
        .dropna()
        .astype(float)
    )
    return values


def main() -> None:
    """Load the fits, normalize them and compute the demo quantities."""
    df = load()
    df = normalize_vars(df)

    # For all datasets and Sample ~ Open-Closed*
    open_closed = df[df.index.get_level_values('Sample').str.contains('Open-Closed')]

    # io is converted concentrations from the number embedded in the scaling_factornnn names
    factors = extract_factors(df)
    io = conc = factors * 1e9/MW

    k = open_closed.loc[:, ('Solution', 'k')]
    k_error = open_closed.loc[:, ('Uncertainty', 'k')]
    fo = k/(k + 1)
    fc = 1/(k + 1)
    fm = 0

    # Forward-difference estimates of d(fo)/dk and d(fc)/dk. The shifted value
    # f(k + h) on its own is not a slope, so subtract f(k) and divide by h.
    # NOTE(review): assumes the *_errors below are meant to be
    # k_error**2 * (df/dk)**2 - confirm against the original calculation.
    h = 1e-8
    kh = k + h
    dk_fo = (kh/(kh + 1) - fo)/h
    dk_fc = (1/(kh + 1) - fc)/h
    dk_fm = 0

    open_errors = k_error**2 * dk_fo**2
    closed_errors = k_error**2 * dk_fc**2
    monomer_errors = k_error**2 * dk_fm**2


if __name__ == '__main__':
    main()

Here is an example demonstrating some of the logic that you wrote for concentrations and errors, with Pandas:

import numpy as np
import pandas as pd

MAX_VARS = 20
MW = 42_000


def load(filename: str = 'scaled_uncertanty_fits.txt') -> pd.DataFrame:
    """Read the jagged fit-results file into a rectangular frame.

    The first line of the file is skipped and replaced by a fixed set of
    column names: 'Sample', 'RedChi2', 'DoF' and 3*MAX_VARS numbered columns.
    Fields are split on either ',' or '±'. Rows whose second column (RedChi2)
    is missing are treated as dataset titles: their 'Sample' text is
    forward-filled into a new leading 'dataset' column and the title rows
    themselves are removed from the result.

    :param filename: path of the text file to parse
    :return: one row per non-title line, indexed by 'csv_index'
    """
    df = pd.read_csv(
        filename,
        sep=',|±',
        # Regex separators are only handled by the Python parser; asking for it
        # explicitly avoids the ParserWarning emitted by the implicit fallback.
        engine='python',
        skiprows=1,  # skip the original headers - they aren't wide enough
        names=[
            'Sample', 'RedChi2', 'DoF',
            *range(3*MAX_VARS),
        ],
    )
    df.index.name = 'csv_index'

    # Forward-fill the dataset name (e.g. WT_2017)
    is_dataset = df.iloc[:, 1].isna()
    df.insert(loc=0, column='dataset', value=df.loc[is_dataset, 'Sample'])
    # Assign the filled column back instead of calling ffill(inplace=True) on
    # the selection: the in-place form operates on a temporary under
    # Copy-on-Write and would leave 'dataset' unfilled.
    df['dataset'] = df['dataset'].ffill()
    return df[~is_dataset]


def normalize_vars(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize variable-solution-uncertainty triples

    For a row holding n variables, the code reads the first n variable columns
    as names, the next n as solutions and the next n as uncertainties. These
    are rearranged into a frame with 'dataset', 'Sample' appended to the index
    and one column per (Solution | Uncertainty, Variable) pair.

    :param df: frame as returned by load()
    :return: float-valued frame with columns keyed by varmeta and Variable
    """
    var_offset = 4  # 'dataset', 'Sample', 'RedChi2', 'DoF' precede the variable columns
    var_cols = df.iloc[:, var_offset:]
    meta_names = pd.Index(name='varmeta', data=['Variable', 'Solution', 'Uncertainty'])

    rectangular = pd.DataFrame(
        index=df.index,
        columns=pd.MultiIndex.from_product((
            meta_names,
            pd.RangeIndex(name='varno', start=0, stop=MAX_VARS),
        )),
    )

    # Each variable contributes three populated cells (name, solution, uncertainty)
    n_vars = var_cols.notna().sum(axis=1)//3

    for row_vars, group in var_cols.groupby(n_vars):
        # The k-th block of row_vars columns belongs to the k-th entry of meta_names
        for position, meta in enumerate(meta_names):
            source = group.iloc[:, position*row_vars: (position + 1)*row_vars]
            source.columns = pd.MultiIndex.from_product(
                ((meta,), range(row_vars)),
                names=('varmeta', 'varno'),
            )
            rectangular.loc[group.index, source.columns] = source

    long = rectangular.stack(level='varno')
    # Unused varno slots are entirely NaN. Drop them explicitly rather than
    # relying on stack() to do so, so that the unstack below never sees
    # repeated NaN 'Variable' keys.
    long = long.dropna(how='all')

    normalized = (
        pd.merge(left=df[['dataset', 'Sample']], right=long[meta_names], on='csv_index')
        .set_index(['dataset', 'Sample', 'Variable'], append=True)
        .astype({'Solution': float, 'Uncertainty': float})
        .unstack('Variable')
    )
    return normalized


def extract_factors(df: pd.DataFrame) -> pd.Series:
    """Pull the number embedded at the end of the 'Solution' variable names.

    Underscores in each name are replaced by '.', then a trailing
    ``digits.digits`` group is extracted. Names without such a suffix are
    dropped.

    :param df: frame as returned by normalize_vars()
    :return: float Series named 'factor', indexed by the original variable name
    """
    names = df.columns[df.columns.get_loc('Solution')].droplevel(0)
    values = (
        names.get_level_values('Variable')
        .to_series(name='factor', index=names)
        .str.replace('_', '.', regex=False)  # literal replacement, stated explicitly
        .str.extract(r'(\d+\.\d+)$', expand=False)
        .dropna()
        .astype(float)
    )
    return values


def main() -> None:
    """Load the fits, normalize them and compute the demo quantities."""
    df = load()
    df = normalize_vars(df)

    # For all datasets and Sample ~ Open-Closed*
    open_closed = df[df.index.get_level_values('Sample').str.contains('Open-Closed')]

    # io is converted concentrations from the number embedded in the scaling_factornnn names
    factors = extract_factors(df)
    io = conc = factors * 1e9/MW

    k = open_closed.loc[:, ('Solution', 'k')]
    k_error = open_closed.loc[:, ('Uncertainty', 'k')]
    fo = k/(k + 1)
    fc = 1/(k + 1)
    fm = 0

    # Forward-difference estimates of d(fo)/dk and d(fc)/dk. The shifted value
    # f(k + h) on its own is not a slope, so subtract f(k) and divide by h.
    # NOTE(review): assumes the *_errors below are meant to be
    # k_error**2 * (df/dk)**2 - confirm against the original calculation.
    h = 1e-8
    kh = k + h
    dk_fo = (kh/(kh + 1) - fo)/h
    dk_fc = (1/(kh + 1) - fc)/h
    dk_fm = 0

    open_errors = k_error**2 * dk_fo**2
    closed_errors = k_error**2 * dk_fc**2
    monomer_errors = k_error**2 * dk_fm**2


if __name__ == '__main__':
    main()

added 194 characters in body

Source Link

edited Aug 19, 2023 at 3:11

Reinderien

71.2k
5
76
257

# Rescale Solution and Uncertainty for the scaling_factor variables of the
# WT_2017 dataset, sample Open-Closed.
converted = df.loc[
    (df['dataset'] == 'WT_2017')
    & (df['Sample'] == 'Open-Closed')
    & df['Variable'].str.startswith('scaling_factor'),
    ['Solution', 'Uncertainty'],
] / MW * 1e9

 Solution Uncertainty varno 22 1 220.922794 1.177406e-07 25 1 173.364558 4.268723e-08 28 1 0.000000 0.000000e+00

# Pick the WT_2017 / Open-Closed rows whose variable name starts with
# 'scaling_factor', then divide by MW and multiply by 1e9.
converted = (
    df.loc[
        df['dataset'].eq('WT_2017')
        & df['Sample'].eq('Open-Closed')
        & df['Variable'].str.startswith('scaling_factor'),
        ['Solution', 'Uncertainty'],
    ]
    .div(MW)
    .mul(1e9)
)

# Pick the rows whose variable name starts with 'scaling_factor' within the
# WT_2017 dataset and Open-Closed sample, then divide by MW and multiply by 1e9.
converted = (
    df.loc[
        df['Variable'].str.startswith('scaling_factor')
        & df['dataset'].eq('WT_2017')
        & df['Sample'].eq('Open-Closed'),
        ['Solution', 'Uncertainty'],
    ]
    .div(MW)
    .mul(1e9)
)

 Solution Uncertainty varno 22 1 220.922794 1.177406e-07 25 1 173.364558 4.268723e-08 28 1 0.000000 0.000000e+00

Source Link

answered Aug 19, 2023 at 3:03

Reinderien

71.2k
5
76
257

the only way I could get this to work is....bad.

Yes. This is what Pandas was made for, and should replace basically everything that you've done in your current program (plotting aside; that part is vaguely OK).

The only big challenge is that your input file format is truly cursed. It has jagged rows, +/- separators, and what look to be dataset titles interspersed with real data.

The following approach can be used to normalize the data:

import pandas as pd

MAX_VARS = 20
MW = 42_000


def load(filename: str = 'scaled_uncertanty_fits.txt') -> pd.DataFrame:
    """Read the jagged fit-results file into a rectangular frame.

    The first line of the file is skipped and replaced by a fixed set of
    column names: 'Sample', 'RedChi2', 'DoF' and 3*MAX_VARS numbered columns.
    Fields are split on either ',' or '±'. Rows whose second column (RedChi2)
    is missing are treated as dataset titles: their 'Sample' text is
    forward-filled into a new leading 'dataset' column and the title rows
    themselves are removed from the result.

    :param filename: path of the text file to parse
    :return: one row per non-title line
    """
    df = pd.read_csv(
        filename,
        sep=',|±',
        # Regex separators are only handled by the Python parser; asking for it
        # explicitly avoids the ParserWarning emitted by the implicit fallback.
        engine='python',
        skiprows=1,  # skip the original headers - they aren't wide enough
        names=[
            'Sample', 'RedChi2', 'DoF',
            *range(3*MAX_VARS),
        ],
    )

    # Forward-fill the dataset name (e.g. WT_2017)
    is_dataset = df.iloc[:, 1].isna()
    df.insert(loc=0, column='dataset', value=df.loc[is_dataset, 'Sample'])
    df['dataset'] = df['dataset'].ffill()
    return df[~is_dataset]


def normalize_vars(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize variable-solution-uncertainty triples

    For a row holding n variables, the code reads the first n variable columns
    as names, the next n as solutions and the next n as uncertainties. The
    result has one row per (source row, varno) pair carrying 'dataset',
    'Sample', 'RedChi2', 'DoF', 'Variable', 'Solution' and 'Uncertainty'.

    :param df: frame as returned by load()
    :return: long-format frame with float 'Solution' and 'Uncertainty' columns
    """
    var_offset = 4  # 'dataset', 'Sample', 'RedChi2', 'DoF' precede the variable columns
    var_cols = df.iloc[:, var_offset:]
    meta_names = pd.Index(name='varmeta', data=['Variable', 'Solution', 'Uncertainty'])

    rectangular = pd.DataFrame(
        index=df.index,
        columns=pd.MultiIndex.from_product((
            meta_names,
            pd.RangeIndex(name='varno', start=0, stop=MAX_VARS),
        )),
    )

    # Each variable contributes three populated cells (name, solution, uncertainty)
    n_vars = var_cols.notna().sum(axis=1)//3

    for row_vars, group in var_cols.groupby(n_vars):
        # The k-th block of row_vars columns belongs to the k-th entry of meta_names
        for position, meta in enumerate(meta_names):
            source = group.iloc[:, position*row_vars: (position + 1)*row_vars]
            source.columns = pd.MultiIndex.from_product(
                ((meta,), range(row_vars)),
                names=('varmeta', 'varno'),
            )
            rectangular.loc[group.index, source.columns] = source

    long = rectangular.stack(level='varno')
    # Unused varno slots are entirely NaN. Drop them explicitly rather than
    # relying on stack() to do so, so they never show up as empty result rows.
    long = long.dropna(how='all')

    # Passing an array as right_on makes merge emit a helper 'key_0' column,
    # which is discarded before the (row, varno) index is restored.
    normalized = pd.merge(
        left=df[['dataset', 'Sample', 'RedChi2', 'DoF']],
        right=long[meta_names],
        left_index=True,
        right_on=long.index.get_level_values(0),
    ).drop('key_0', axis=1).set_index(long.index)

    return normalized.astype({
        'Solution': float, 'Uncertainty': float,
    })


def main() -> None:
    """Load and normalize the fit results."""
    df = load()
    df = normalize_vars(df)
    # ...


if __name__ == '__main__':
    main()

From here, everything is "easy". For instance: want to convert concentrations, only for scaling_factor variables, only for the WT_2017 dataset and sample Open-Closed?

# Pick the WT_2017 / Open-Closed rows whose variable name starts with
# 'scaling_factor', then divide by MW and multiply by 1e9.
converted = (
    df.loc[
        df['dataset'].eq('WT_2017')
        & df['Sample'].eq('Open-Closed')
        & df['Variable'].str.startswith('scaling_factor'),
        ['Solution', 'Uncertainty'],
    ]
    .div(MW)
    .mul(1e9)
)