I'm new here, so I hope I have included all the necessary information:
- As the CSV file is huge (10 GB), I have not attached it; the download URL is in the code below if needed
- URL link to the data description (column type...)
The delimiter is \t, but they still call it a CSV (as described in the "data description" file).
After replacing the wrong delimiters in the CSV file (replacing '\n\t' with '\t' where necessary) and defining a data type for each column, I try to read it using the \t delimiter, but I encounter two errors.
1)
parse error on line 1715281: expected 209 fields, saw 239. To investigate this, I fetched that line with getline and split it on the '\t' delimiter; the resulting field count was len(getline(...).split('\t')) = 209.
2) error: Unable to parse string "URL link". None of the lines before this one caused any issue.
My questions are:
- Why do I get this parser error on line 1715281, given that I correct the wrong delimiters at the beginning of my code?
- Is it a good approach to use 'getline' and then compare number of columns in the CSV to the len of the split line?
- How can I handle the "unable to parse string: URL" error when none of the previous lines generate an issue?
Please find hereunder my codes, comments and full error messages
import os.path import pandas as pd import numpy as np import linecache # data file available under: https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz # it's .csv but delimiter is TAB # generate the path to the file data_local_path = os.getcwd() + '\\' csv_filename = 'en.openfoodfacts.org.products.csv' csv_local_path = data_local_path + csv_filename # generate the path to create the file corrected clean_filename = 'en.off.corrected.csv' clean_local_path = data_local_path + clean_filename # check if the file is already existing, if not then proceed to wrong delimiter replacement if not os.path.isfile(clean_local_path): with open(csv_local_path, 'r',encoding='utf-8') as csv_file, open(clean_local_path, 'a', encoding='utf-8') as clean_file: for row in csv_file: clean_file.write(row.replace('\n\t', '\t')) # columns type are defined under : https://static.openfoodfacts.org/data/data-fields.txt column_names = pd.read_csv(clean_local_path, sep='\t', encoding = 'utf-8', nrows=0).columns.values column_types = {col: 'Int64' for (col) in column_names if col.endswith (('_t', '_n'))} column_types |= {col: float for (col) in column_names if col.endswith (('_100g', '_serving'))} column_types |= {col: str for (col) in column_names if not col.endswith (('_t', '_n', '_100g', '_serving', '_tags'))} print ("number of columns detected: ",len(column_names)) # output is "number of columns detected: 209" print (column_names) # Load the data data = pd.read_csv(clean_local_path, sep='\t', encoding='utf_8', dtype=column_types, parse_dates=[col for (col) in column_names if col.endswith('_datetime')], on_bad_lines='warn' ) # display info data.info() Message error at the line " data = pd.read_csv..." is:
...\AppData\Local\Temp\ipykernel_2824\611804071.py:2: ParserWarning: Skipping line 1715281: expected 209 fields, saw 239 data = pd.read_csv(clean_local_path, sep='\t', encoding='utf_8', --------------------------------------------------------------------------- ValueError Traceback (most recent call last) File lib.pyx:2391, in pandas._libs.lib.maybe_convert_numeric() ValueError: Unable to parse string "https://images.openfoodfacts.org/images/products/356/007/117/1049/front_fr.3.200.jpg" During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) Cell In[6], line 2 1 # Load the data ----> 2 data = pd.read_csv(clean_local_path, sep='\t', encoding='utf_8', 3 dtype=column_types, parse_dates=[col for (col) in column_names if col.endswith('_datetime')], 4 on_bad_lines='warn' 5 ) 6 # display info 7 data.info() File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend) 1013 kwds_defaults = _refine_defaults_read( 1014 dialect, 1015 delimiter, (...) 
1022 dtype_backend=dtype_backend, 1023 ) 1024 kwds.update(kwds_defaults) -> 1026 return _read(filepath_or_buffer, kwds) File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\readers.py:626, in _read(filepath_or_buffer, kwds) 623 return parser 625 with parser: --> 626 return parser.read(nrows) File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\readers.py:1923, in TextFileReader.read(self, nrows) 1916 nrows = validate_integer("nrows", nrows) 1917 try: 1918 # error: "ParserBase" has no attribute "read" 1919 ( 1920 index, 1921 columns, 1922 col_dict, -> 1923 ) = self._engine.read( # type: ignore[attr-defined] 1924 nrows 1925 ) 1926 except Exception: 1927 self.close() File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\c_parser_wrapper.py:234, in CParserWrapper.read(self, nrows) 232 try: 233 if self.low_memory: --> 234 chunks = self._reader.read_low_memory(nrows) 235 # destructive to chunks 236 data = _concatenate_chunks(chunks) File parsers.pyx:838, in pandas._libs.parsers.TextReader.read_low_memory() File parsers.pyx:921, in pandas._libs.parsers.TextReader._read_rows() File parsers.pyx:1066, in pandas._libs.parsers.TextReader._convert_column_data() File parsers.pyx:1105, in pandas._libs.parsers.TextReader._convert_tokens() File parsers.pyx:1211, in pandas._libs.parsers.TextReader._convert_with_dtype() File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\arrays\numeric.py:283, in NumericArray._from_sequence_of_strings(cls, strings, dtype, copy) 277 @classmethod 278 def _from_sequence_of_strings( 279 cls, strings, *, dtype: Dtype | None = None, copy: bool = False 280 ) -> Self: 281 from pandas.core.tools.numeric import to_numeric --> 283 scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable") 284 return cls._from_sequence(scalars, dtype=dtype, copy=copy) File 
~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\tools\numeric.py:232, in to_numeric(arg, errors, downcast, dtype_backend) 230 coerce_numeric = errors not in ("ignore", "raise") 231 try: --> 232 values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] 233 values, 234 set(), 235 coerce_numeric=coerce_numeric, 236 convert_to_masked_nullable=dtype_backend is not lib.no_default 237 or isinstance(values_dtype, StringDtype) 238 and not values_dtype.storage == "pyarrow_numpy", 239 ) 240 except (ValueError, TypeError): 241 if errors == "raise": File lib.pyx:2433, in pandas._libs.lib.maybe_convert_numeric() ValueError: Unable to parse string "https://images.openfoodfacts.org/images/products/356/007/117/1049/front_fr.3.200.jpg" at position 1963 'Getline' and 'split' used to check ParserWarning concerning line 1715281
#get the line where the first warning had occured line = linecache.getline(csv_local_path,1715281) print (line) # Split the string using tab delimiter split_list = line.split('\t') # Output the result print("concerning Parser Warning: Skipping line 1715281: expected 209 fields, saw 239") print("number of data detected in the raw 1715281: ",len(split_list)) print ("number of columns detected in CSV: ",len(column_names)) # Output is: # concerning Parser Warning: Skipping line 1715281: expected 209 fields, saw 239 # number of data detected in the raw 1715281: 209 # number of columns detected in CSV: 209 I try "on_bad_lines='skip'" but without success