Here's a way to do what I believe your question is asking:
# Parse and store the first 3 values in column 0 so that we can use them # as values for 3 new columns later. new_columns = [x.split(':')[-1].strip() for x in df1.iloc[0:3,0].to_list()] # Transpose so that we can use set_index() to replace the index # (the columns from the original df1) to ['Item: Food', NaN, NaN, NaN], # then transpose back so that the new index becomes the columns. df1 = df1.T.set_index(3).T # Use reset_index() to replace the index with a fresh range # index (0, 1, 2, ...) so we can use iloc() to discard the # first 3 unwanted rows, then call reset_index() again. df1 = df1.reset_index(drop=True).iloc[3:].reset_index(drop=True) # Get rid of vestigial name for columns. df1.columns.names = [None] # Add the three new columns set to the values saved earlier. df1[['View Description', 'Series ID', 'Series Name']] = new_columns
Here is full test case (with the above annotated code compressed into fewer lines):
import pandas as pd s = [ ' | NA | NA_1 | NA_2 | NA_3 |', '0 | 12-Month Percent Change | NaN | NaN | NaN |', '1 | Series Id: CUUR0000SAF1 | NaN | NaN | NaN |', '2 | Item: Food | NaN | NaN | NaN |', '3 | Year | Jan | Feb | Mar |', '4 | 2010 | -0.4 | -0.2 | 0.2 |', '5 | 2011 | 1.8 | 2.3 | 2.9 |'] df1 = pd.DataFrame( [[x.strip() for x in y.split('|')[1:-1]] for y in s[1:]], columns = [x.strip() for x in s[0].split('|')[1:-1]], ) print(df1) new_columns = [x.split(':')[-1].strip() for x in df1.iloc[0:3,0].to_list()] df1 = df1.T.set_index(3).T.reset_index(drop=True).iloc[3:].reset_index(drop=True) df1.columns.names = [None] df1[['View Description', 'Series ID', 'Series Name']] = new_columns print(df1)
Output:
NA NA_1 NA_2 NA_3 0 12-Month Percent Change NaN NaN NaN 1 Series Id: CUUR0000SAF1 NaN NaN NaN 2 Item: Food NaN NaN NaN 3 Year Jan Feb Mar 4 2010 -0.4 -0.2 0.2 5 2011 1.8 2.3 2.9 Year Jan Feb Mar View Description Series ID Series Name 0 2010 -0.4 -0.2 0.2 12-Month Percent Change CUUR0000SAF1 Food 1 2011 1.8 2.3 2.9 12-Month Percent Change CUUR0000SAF1 Food
UPDATE: This is code that allows us to configure (1) the cell coordinates of each of 3 cells to be used for new column values (new_col_coords) and (2) the header_row above which rows are discarded:
import pandas as pd s = [ ' | NA | NA_1 | NA_2 | NA_3 |', '0 | 12-Month Percent Change | NaN | NaN | NaN |', '91 | To be discarded | NaN | NaN | NaN |', '1 | Series Id: CUUR0000SAF1 | Abc | NaN | NaN |', '92 | To be discarded | NaN | NaN | NaN |', '93 | To be discarded | NaN | NaN | NaN |', '94 | To be discarded | NaN | NaN | NaN |', '2 | Item: Food | Xyz | NaN | NaN |', '95 | To be discarded | NaN | NaN | NaN |', '96 | To be discarded | NaN | NaN | NaN |', '97 | To be discarded | NaN | NaN | NaN |', '98 | To be discarded | NaN | NaN | NaN |', '3 | Year | Jan | Feb | Mar |', '4 | 2010 | -0.4 | -0.2 | 0.2 |', '5 | 2011 | 1.8 | 2.3 | 2.9 |'] df1 = pd.DataFrame( [[x.strip() for x in y.split('|')[1:-1]] for y in s[1:]], columns = [x.strip() for x in s[0].split('|')[1:-1]], ) print(df1) # parse and store the 3 values at specified coordinates so that we can use them as values for 3 new columns later new_col_coords = [[0,0], [2,1], [6,1]] new_columns = [x.split(':')[-1].strip() for x in [df1.iloc[i, j] for i, j in new_col_coords]] header_row = 11 # Here's how to do everything that follows in one line of code: #df1 = df1.T.set_index(header_row).T.reset_index(drop=True).iloc[header_row:].reset_index(drop=True) # Transpose so that we can use set_index() to change the index to ['Item: Food', NaN, NaN, NaN], then transpose back so that index becomes the columns df1 = df1.T.set_index(header_row).T # Use reset_index() to replace the index with a fresh range index (0, 1, 2, ...) so we can use iloc() to discard the unwanted rows above header_row, then call reset_index() again df1 = df1.reset_index(drop=True).iloc[header_row:].reset_index(drop=True) # Get rid of vestigial name for columns df1.columns.names = [None] # Add the three new columns set to the values saved earlier df1[['View Description', 'Series ID', 'Series Name']] = new_columns print(df1)
Output:
NA NA_1 NA_2 NA_3 0 12-Month Percent Change NaN NaN NaN 1 To be discarded NaN NaN NaN 2 Series Id: CUUR0000SAF1 Abc NaN NaN 3 To be discarded NaN NaN NaN 4 To be discarded NaN NaN NaN 5 To be discarded NaN NaN NaN 6 Item: Food Xyz NaN NaN 7 To be discarded NaN NaN NaN 8 To be discarded NaN NaN NaN 9 To be discarded NaN NaN NaN 10 To be discarded NaN NaN NaN 11 Year Jan Feb Mar 12 2010 -0.4 -0.2 0.2 13 2011 1.8 2.3 2.9 Year Jan Feb Mar View Description Series ID Series Name 0 2010 -0.4 -0.2 0.2 12-Month Percent Change Abc Xyz 1 2011 1.8 2.3 2.9 12-Month Percent Change Abc Xyz