joeyism
diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py
Lines changed: 3 additions & 3 deletions
diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
Lines changed: 19 additions & 74 deletions
diff --git a/samples/scrape_person.py b/samples/scrape_person.py
Lines changed: 31 additions & 4 deletions
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
 
  driver.get(self.linkedin_url)
 
- _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+ _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))
 
  navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")
 
- self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+ self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()
 
  # Click About Tab or View All Link
  try:
@@ -360,6 +360,6 @@ def __repr__(self):
  _output['affiliated_companies'] = self.affiliated_companies
  _output['employees'] = self.employees
  _output['headcount'] = self.headcount
- 
+
  return json.dumps(_output).replace('\n', '')
 
@@ -1,3 +1,5 @@
+import time
+
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
@@ -115,11 +117,13 @@ def get_experiences(self):
  self.scroll_to_bottom()
  main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
  for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
- position = position.find_element(By.XPATH, "//div[@data-view-name='profile-component-entity']")
+ position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
  company_logo_elem, position_details = position.find_elements(By.XPATH, "*")
 
  # company elem
  company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
+ if not company_linkedin_url:
+ continue
 
  # position details
  position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@ def get_experiences(self):
  company = outer_positions[0].find_element(By.TAG_NAME,"span").text
  work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
  location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+ else:
+ position_title = ""
+ company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+ work_times = ""
+ location = ""
+
 
  times = work_times.split("·")[0].strip() if work_times else ""
  duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
 
  from_date = " ".join(times.split(" ")[:2]) if times else ""
  to_date = " ".join(times.split(" ")[3:]) if times else ""
-
- if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")) > 1:
- descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")
+ if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
+ inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+ .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+ .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+ else:
+ inner_positions = []
+ if len(inner_positions) > 1:
+ descriptions = inner_positions
  for description in descriptions:
  res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
  position_title_elem = res[0] if len(res) > 0 else None
@@ -249,7 +264,6 @@ def get_name_and_location(self):
  self.name = top_panel.find_element(By.TAG_NAME, "h1").text
  self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text
 
-
  def get_about(self):
  try:
  about = self.driver.find_element(By.ID,"about").find_element(By.XPATH,"..").find_element(By.CLASS_NAME,"display-flex").text
@@ -293,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
  self.get_educations()
 
  driver.get(self.linkedin_url)
-
- # get interest
- try:
-
- _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
- EC.presence_of_element_located(
- (
- By.XPATH,
- "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
- )
- )
- )
- interestContainer = driver.find_element(By.XPATH,
- "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
- )
- for interestElement in interestContainer.find_elements(By.XPATH,
- "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
- ):
- interest = Interest(
- interestElement.find_element(By.TAG_NAME, "h3").text.strip()
- )
- self.add_interest(interest)
- except:
- pass
-
- # get accomplishment
- try:
- _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
- EC.presence_of_element_located(
- (
- By.XPATH,
- "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
- )
- )
- )
- acc = driver.find_element(By.XPATH,
- "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
- )
- for block in acc.find_elements(By.XPATH,
- "//div[@class='pv-accomplishments-block__content break-words']"
- ):
- category = block.find_element(By.TAG_NAME, "h3")
- for title in block.find_element(By.TAG_NAME,
- "ul"
- ).find_elements(By.TAG_NAME, "li"):
- accomplishment = Accomplishment(category.text, title.text)
- self.add_accomplishment(accomplishment)
- except:
- pass
-
- # get connections
- try:
- driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
- _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
- EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
- )
- connections = driver.find_element(By.CLASS_NAME, "mn-connections")
- if connections is not None:
- for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
- anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
- url = anchor.get_attribute("href")
- name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
- occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
- contact = Contact(name=name, occupation=occupation, url=url)
- self.add_contact(contact)
- except:
- connections = None
-
  if close_on_complete:
  driver.quit()
 
 
@@ -1,9 +1,36 @@
 import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
 from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()
 
 email = os.getenv("LINKEDIN_USER")
 password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
+user_input = []
+urls = []
+while True:
+ user_input = input("Enter a comma-separated list of linkedin urls: ")
+ if user_input == "exit":
+ break
+ urls = user_input.split(",")
+ results = []
+ for url in urls:
+ print(f'scraping {url}')
+ person = Person(url, driver=driver, close_on_complete=False)
+ company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+ results.append((person, company))
+
+ print('RESULTS:')
+ print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+ for person, company in results:
+ experience = person.experiences[0]
+ print(f'"{person.name}", '
+ f'"{person.location}", '
+ f'"{experience.position_title}", '
+ f'"{experience.institution_name}", '
+ f'"{experience.linkedin_url}", '
+ f'"{company.industry}", '
+ f'"{company.website}", '
+ f'"{company.company_size}", '
+ )