I'm a newbie getting into web scrapers. I've made something that works, it takes 3.2 hours to complete job and randomly have 10 lines blank each time I run this job. Help is much appreciated!
import sys import pandas as pd from selenium import webdriver import time from datetime import datetime from bs4 import BeautifulSoup from selenium.webdriver.support.select import Select from selenium.webdriver.common.by import By from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import NoSuchElementException from selenium.webdriver import ActionChains from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException from webdriver_manager.chrome import ChromeDriverManager def getBrowser(): options = Options() options.add_argument("--incognito") global browser options.add_argument("start-maximized") s = Service('''C:\\Users\\rajes\\yogita\\drivers\\chromedriver.exe''') browser = webdriver.Chrome('''C:\\Users\\rajes\\yogita\\drivers\\chromedriver.exe''') return browser def getISINUrls(browser): url = 'http://www.nasdaqomxnordic.com/bonds/denmark/' browser.get(url) browser.maximize_window() time.sleep(1) bonds = {} try: getUrls(browser, bonds) pg_down = browser.find_element(By.CSS_SELECTOR, "#bondsSearchDKOutput > div:nth-child(1) > table > tbody > tr > td.pgDown") browser.execute_script("arguments[0].click();", pg_down) time.sleep(1) while (True): # pages = browser.find_element(By.ID, 'bondsSearchDKOutput') getUrls(browser, bonds) pg_down = browser.find_element(By.CSS_SELECTOR, "#bondsSearchDKOutput > div:nth-child(1) > table > tbody > tr > td.pgDown") browser.execute_script("arguments[0].click();", pg_down) time.sleep(1) except NoSuchElementException as e: pass return bonds def getUrls(browser, bonds): hrefs_in_table = browser.find_elements(By.XPATH, '//a[@href]') count = 0 for element in hrefs_in_table: href = element.get_attribute('href') if 'microsite?Instrumen' in href: bonds[element.text] = href count += 1 def saveURLs(bond): filename = "linkstoday.txt" fo = open(filename, "w") for k, v in bonds.items(): fo.write(str(v) + '\n') fo.close() def getSleepTime(count): first = 1 res = 1 i = 0; while i < count: i += 1 temp = res res = temp + first first = temp return res def getISINData(browser2): with open("linkstoday.txt", "r") as a_file: denmark_drawing = [] for line in a_file: result_found = False count = 2 Isin_code = str() short_name = str() Volume_circulating = str() Repayment_date = str() Drawing_percent = str() wait_time = getSleepTime(0) + 1 while not result_found and count < 5: stripped_line = line.strip() browser2.get(stripped_line) browser2.maximize_window() time.sleep(getSleepTime(count) + 1) WebDriverWait(browser2, 1).until( EC.element_to_be_clickable((By.CSS_SELECTOR, '#ui-id-3 > span'))).click() time.sleep(getSleepTime(count)) Isin_code = browser2.find_element(By.CSS_SELECTOR, '#db-f-isin').text short_name = browser2.find_element(By.CSS_SELECTOR, '#db-f-nm').text Volume_circulating = browser2.find_element(By.CSS_SELECTOR, '#db-f-oa').text Repayment_date = browser2.find_element(By.CSS_SELECTOR, '#db-f-drd').text Drawing_percent = browser2.find_element(By.CSS_SELECTOR, '#db-f-dp').text if Isin_code == " ": count += 1 else: result_found = True temp_data = [Isin_code, short_name, Volume_circulating, Repayment_date, Drawing_percent] denmark_drawing.append(temp_data) # Writing data to dataframe df3 = pd.DataFrame(denmark_drawing, columns=['ISIN', 'Shortname', 'OutstandingVolume', 'Repaymentdate', 'Drawingpercent']) df3.to_csv('Denamrkscrapedsata_20220121.csv', index=False) if __name__ == "__main__": browser = getBrowser() print(f'''Call to getISINUrls start at: {datetime.now()}''') bonds = getISINUrls(browser) print(f'''Call to getISINUrls ends at : {datetime.now()}''') print(f'''total records: {len(bonds)}''') browser.close() browser2 = getBrowser() print(f'''Call to getISINData start at: {datetime.now()}''') getISINData(browser2) print(f'''Call to getISINData ends at : {datetime.now()}''') saveURLs(bonds) browser2.close() sys.exit(0)