At this time, my script will check multiple url if some 5 different type of keywords are present in the webpage. Depending of which keyword is found or not, it will output "ok" or "no".
I use set_page_load_timeout(30) to avoid infinite load of a url.
Problem : some webpages doesn't load fully before timeout (even if it's a "very" long timeout). But I can see visually (no headless) that the page is loaded. At least it could check the keywords in the webpage but it doesn't and after timeout, it display "fail" and the scrape saying "no" doesn't show to the final output.
So I don't want to put an except after 30 seconds but I want to stop loading the page after 30 seconds and takes what it can be taken.
My code :
# coding=utf-8 import re sites=[] keywords_1=[] keywords_2=[] keywords_3=[] keywords_4=[] keywords_5=[] import sys from selenium import webdriver import csv import urllib.parse from datetime import datetime from datetime import date from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options def reader3(filename): with open(filename, 'r') as csvfile: # creating a csv reader object csvreader = csv.reader(csvfile) # extracting field names through first row # extracting each data row one by one for row in csvreader: sites.append(str(row[0]).lower()) try: reader3("data/script/filter_domain_OUTPUT.csv") except Exception as e: print(e) sys.exit() exc=[] def reader3(filename): with open(filename, 'r') as csvfile: # creating a csv reader object csvreader = csv.reader(csvfile) # extracting field names through first row # extracting each data row one by one for row in csvreader: exc.append(str(row[0]).lower()) try: reader3("data/script/checking_EXCLUDE.csv") except Exception as e: print(e) sys.exit() def reader2(filename): with open(filename, 'r') as csvfile: # creating a csv reader object csvreader = csv.reader(csvfile) # extracting field names through first row # extracting each data row one by one for row in csvreader: keywords_1.append(str(row[0]).lower()) keywords_2.append(str(row[1]).lower()) keywords_3.append(str(row[2]).lower()) keywords_4.append(str(row[3]).lower()) keywords_5.append(str(row[4]).lower()) try: reader2("data/script/checking_KEYWORD.csv") except Exception as e: print(e) sys.exit() chrome_options = Options() chrome_options.page_load_strategy = 'none' chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--lang=en') chrome_options.add_argument('--disable-notifications') #chrome_options.headless = True chrome_options.add_argument('start-maximized') chrome_options.add_argument('enable-automation') chrome_options.add_argument('--disable-infobars') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--disable-browser-side-navigation') chrome_options.add_argument('--disable-gpu') driver = webdriver.Chrome(options=chrome_options) for site in sites: try: status_1 = "no" status_2 = "no" status_3 = "no" status_4 = "no" status_5 = "no" now = datetime.now() current_time = now.strftime("%H:%M:%S") today = date.today() print("[" + current_time + "] " + str(site)) if 'http' in site: driver.get(site) else: driver.get("http://" + site) r=str(driver.page_source).lower() driver.set_page_load_timeout(30) for keyword_1 in keywords_1: if keyword_1 in r: status_1="ok" print("home -> " +str(keyword_1)) break for keyword_2 in keywords_2: if keyword_2 in r: status_2="ok" print("home -> " +str(keyword_2)) break for keyword_3 in keywords_3: if keyword_3 in r: status_3="ok" print("home -> " +str(keyword_3)) break for keyword_4 in keywords_4: if keyword_4 in r: status_4="ok" print("home -> " +str(keyword_4)) break for keyword_5 in keywords_5: if keyword_5 in r: status_5="ok" print("Home ->" +str(keyword_5)) break with open('data/script/checking_OUTPUT.csv', mode='a') as employee_file: employee_writer = csv.writer(employee_file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL,lineterminator='\n') write=[site,status_1,status_2,status_3,status_4,status_5] employee_writer.writerow(write) except Exception as e: #driver.delete_all_cookies() print("Fail") driver.quit()