I have read the existing posts on how to remove non-ASCII characters from a string in Python. My issue is that when I apply the same approach to a DataFrame that I have read from a CSV file, it doesn't work. Any idea why?
import re
import string
import unicodedata

import numpy as np
import pandas as pd


def preprocess(x):
    """Return *x* reduced to plain ASCII text.

    The input is NFKD-normalized so that accented characters split into a
    base letter plus combining marks; every code point that still cannot be
    represented in ASCII is then dropped.

    Args:
        x: UTF-8 encoded bytes, or already-decoded text.

    Returns:
        The ASCII-only result as the interpreter's native ``str`` type.

    Raises:
        UnicodeDecodeError: if *x* is bytes that are not valid UTF-8.
        TypeError: if *x* is neither bytes nor text.
    """
    # Decode only genuine byte strings. The former ``unicode(x, "utf8")``
    # call does not exist on Python 3 and raised TypeError on Python 2 when
    # handed text that was already decoded.
    text = x.decode("utf8") if isinstance(x, bytes) else x

    # Decompose, then discard whatever is still outside the ASCII range.
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

    # On Python 2 ``bytes`` is ``str``, so the value is returned as before;
    # on Python 3 it is decoded so callers receive text rather than bytes.
    if isinstance(ascii_bytes, str):
        return ascii_bytes
    return ascii_bytes.decode('ascii')


# A bytes literal keeps the \xc3\xbc escapes meaning "UTF-8 encoded u-umlaut"
# on both Python 2 and Python 3.
preprocess(b"Ludwig Maximilian University of Munich / M\xc3\xbcnchen (LMU) and Siemens AG")
# Output:
# 'Ludwig Maximilian University of Munich / Munchen (LMU) and Siemens AG'
# Build a one-column frame from an in-memory literal and clean that column.
df = pd.DataFrame(
    ["Ludwig Maximilian University of Munich / M\xc3\xbcnchen (LMU) and Siemens AG"]
)
df.columns = ['text']

# Null cells are passed through untouched; every other cell goes through
# preprocess().
df['text'] = df['text'].apply(
    lambda cell: cell if pd.isnull(cell) else preprocess(cell)
)

df['text'][0]
# Output:
# 'Ludwig Maximilian University of Munich / Munchen (LMU) and Siemens AG'
# Load the data from disk and clean its 'text' column.
df1 = pd.read_csv('sample.csv')

# Null cells are passed through untouched; every other cell goes through
# preprocess().
df1['text'] = df1['text'].apply(
    lambda cell: cell if pd.isnull(cell) else preprocess(cell)
)

df1['text'][0]
# Observed output (the \xc3\xbc sequence is still present):
# 'Ludwig Maximilian University of Munich / M\xc3\xbcnchen (LMU) and Siemens AG'
#
# NOTE(review): preprocess() encodes to ASCII with errors ignored, so its
# result cannot contain raw non-ASCII bytes. Presumably sample.csv stores the
# escape sequence as literal backslash text rather than UTF-8 bytes — confirm
# by inspecting the file.

