The code below reads the information at the link below into an Excel file; maybe others will find it useful as well:
URL: https://docs.google.com/spreadsheets/d/1G8FQNlitoRr1oK2-LZEloeg0_VBP-E0J_WoSXqAhxNo/pubhtml#
import pandas as pd

# Published Google Sheets page; pandas parses every HTML <table> found on it.
url = 'https://docs.google.com/spreadsheets/d/1G8FQNlitoRr1oK2-LZEloeg0_VBP-E0J_WoSXqAhxNo/pubhtml'

# Workbook written by main(); it receives one sheet per parsed table.
output_filename = 'orals_and_posters.xlsx'


def clean_dataframe(df, threshold=0.5):
    """Drop sparse columns, then drop every row that still contains a NaN.

    Args:
        df: The DataFrame to clean. It is not modified.
        threshold: Fraction (0-1) of NaN values at or above which a column
            is discarded. The comparison is strict, so a column whose NaN
            fraction equals ``threshold`` is dropped.

    Returns:
        A new DataFrame containing only the kept columns and the rows
        without NaNs, with a fresh 0-based index.
    """
    # isna().mean() yields the NaN *fraction* per column (0.0 - 1.0).
    nan_fraction = df.isna().mean()
    cols_to_keep = nan_fraction[nan_fraction < threshold].index
    # Chaining returns new objects, so nothing is mutated in place on a
    # selection of the caller's frame.
    return df[cols_to_keep].dropna().reset_index(drop=True)


def modify_dataframe(df):
    """Drop the first column and promote the first row to column headers.

    Args:
        df: The DataFrame to reshape. It is not modified.

    Returns:
        A new DataFrame without the first column, whose column labels are
        the values of the former first row, and whose remaining rows are
        re-indexed from 0. If ``df`` has no rows there is no header row to
        promote, so the frame is returned with only the first column
        removed.
    """
    body = df.iloc[:, 1:]
    if len(body) == 0:
        # Nothing to promote; avoid the IndexError that iloc[0] would raise.
        return body.reset_index(drop=True)

    header = body.iloc[0]
    result = body.iloc[1:].reset_index(drop=True)
    # Assign plain values rather than the Series itself so the old row
    # label (e.g. 0) does not linger as the name of the columns index.
    result.columns = header.tolist()
    return result


def main():
    """Download the sheet, tidy both tables and save them to one workbook."""
    # Step 1: read all tables from the HTML page.
    tables = pd.read_html(url)

    # Assuming the first table is 'orals' and the second is 'posters'.
    df_orals = tables[0]
    df_posters = tables[1]

    # Step 2: remove mostly-empty columns and incomplete rows.
    df_orals_clean = clean_dataframe(df_orals)
    df_posters_clean = clean_dataframe(df_posters)

    # Step 3: remove the first column and use the first row as the header.
    df_orals_modified = modify_dataframe(df_orals_clean)
    df_posters_modified = modify_dataframe(df_posters_clean)

    # Step 4: write each DataFrame to its own sheet of a single workbook.
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        df_orals_modified.to_excel(writer, sheet_name='Orals', index=False)
        df_posters_modified.to_excel(writer, sheet_name='Posters', index=False)

    print(f"DataFrames saved to '{output_filename}' with two sheets: 'Orals' and 'Posters'")


if __name__ == "__main__":
    main()