Skip to content

Commit 8aa4580

Browse files
authored
Merge pull request #2 from axblueblader/clean_scrape
Clean scrape
2 parents 37bdaeb + 31612bf commit 8aa4580

File tree

3 files changed

+53
-81
lines changed

3 files changed

+53
-81
lines changed

linkedin_scraper/company.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
184184

185185
driver.get(self.linkedin_url)
186186

187-
_ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
187+
_ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))
188188

189189
navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")
190190

191-
self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
191+
self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()
192192

193193
# Click About Tab or View All Link
194194
try:
@@ -360,6 +360,6 @@ def __repr__(self):
360360
_output['affiliated_companies'] = self.affiliated_companies
361361
_output['employees'] = self.employees
362362
_output['headcount'] = self.headcount
363-
363+
364364
return json.dumps(_output).replace('\n', '')
365365

linkedin_scraper/person.py

Lines changed: 19 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import time
2+
13
import requests
24
from selenium import webdriver
35
from selenium.webdriver.common.by import By
@@ -115,11 +117,13 @@ def get_experiences(self):
115117
self.scroll_to_bottom()
116118
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
117119
for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
118-
position = position.find_element(By.XPATH, "//div[@data-view-name='profile-component-entity']")
120+
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
119121
company_logo_elem, position_details = position.find_elements(By.XPATH, "*")
120122

121123
# company elem
122124
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
125+
if not company_linkedin_url:
126+
continue
123127

124128
# position details
125129
position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@ def get_experiences(self):
143147
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
144148
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
145149
location = outer_positions[2].find_element(By.TAG_NAME,"span").text
150+
else:
151+
position_title = ""
152+
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
153+
work_times = ""
154+
location = ""
155+
146156

147157
times = work_times.split("·")[0].strip() if work_times else ""
148158
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
149159

150160
from_date = " ".join(times.split(" ")[:2]) if times else ""
151161
to_date = " ".join(times.split(" ")[3:]) if times else ""
152-
153-
if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")) > 1:
154-
descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")
162+
if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
163+
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
164+
.find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
165+
.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
166+
else:
167+
inner_positions = []
168+
if len(inner_positions) > 1:
169+
descriptions = inner_positions
155170
for description in descriptions:
156171
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
157172
position_title_elem = res[0] if len(res) > 0 else None
@@ -249,7 +264,6 @@ def get_name_and_location(self):
249264
self.name = top_panel.find_element(By.TAG_NAME, "h1").text
250265
self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text
251266

252-
253267
def get_about(self):
254268
try:
255269
about = self.driver.find_element(By.ID,"about").find_element(By.XPATH,"..").find_element(By.CLASS_NAME,"display-flex").text
@@ -293,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
293307
self.get_educations()
294308

295309
driver.get(self.linkedin_url)
296-
297-
# get interest
298-
try:
299-
300-
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
301-
EC.presence_of_element_located(
302-
(
303-
By.XPATH,
304-
"//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
305-
)
306-
)
307-
)
308-
interestContainer = driver.find_element(By.XPATH,
309-
"//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
310-
)
311-
for interestElement in interestContainer.find_elements(By.XPATH,
312-
"//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
313-
):
314-
interest = Interest(
315-
interestElement.find_element(By.TAG_NAME, "h3").text.strip()
316-
)
317-
self.add_interest(interest)
318-
except:
319-
pass
320-
321-
# get accomplishment
322-
try:
323-
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
324-
EC.presence_of_element_located(
325-
(
326-
By.XPATH,
327-
"//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
328-
)
329-
)
330-
)
331-
acc = driver.find_element(By.XPATH,
332-
"//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
333-
)
334-
for block in acc.find_elements(By.XPATH,
335-
"//div[@class='pv-accomplishments-block__content break-words']"
336-
):
337-
category = block.find_element(By.TAG_NAME, "h3")
338-
for title in block.find_element(By.TAG_NAME,
339-
"ul"
340-
).find_elements(By.TAG_NAME, "li"):
341-
accomplishment = Accomplishment(category.text, title.text)
342-
self.add_accomplishment(accomplishment)
343-
except:
344-
pass
345-
346-
# get connections
347-
try:
348-
driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
349-
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
350-
EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
351-
)
352-
connections = driver.find_element(By.CLASS_NAME, "mn-connections")
353-
if connections is not None:
354-
for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
355-
anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
356-
url = anchor.get_attribute("href")
357-
name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
358-
occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
359-
360-
contact = Contact(name=name, occupation=occupation, url=url)
361-
self.add_contact(contact)
362-
except:
363-
connections = None
364-
365310
if close_on_complete:
366311
driver.quit()
367312

samples/scrape_person.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,36 @@
11
import os
2-
from linkedin_scraper import Person, actions
2+
from linkedin_scraper import Person, actions, Company
33
from selenium import webdriver
4-
driver = webdriver.Chrome("./chromedriver")
4+
5+
driver = webdriver.Chrome()
56

67
email = os.getenv("LINKEDIN_USER")
78
password = os.getenv("LINKEDIN_PASSWORD")
8-
actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
9-
person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
9+
actions.login(driver, email, password) # if email and password aren't given, it'll prompt in terminal
10+
user_input = []
11+
urls = []
12+
while True:
13+
user_input = input("Enter a comma-separated list of linkedin urls: ")
14+
if user_input == "exit":
15+
break
16+
urls = user_input.split(",")
17+
results = []
18+
for url in urls:
19+
print(f'scraping {url}')
20+
person = Person(url, driver=driver, close_on_complete=False)
21+
company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
22+
results.append((person, company))
23+
24+
print('RESULTS:')
25+
print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
26+
for person, company in results:
27+
experience = person.experiences[0]
28+
print(f'"{person.name}", '
29+
f'"{person.location}", '
30+
f'"{experience.position_title}", '
31+
f'"{experience.institution_name}", '
32+
f'"{experience.linkedin_url}", '
33+
f'"{company.industry}", '
34+
f'"{company.website}", '
35+
f'"{company.company_size}", '
36+
)

0 commit comments

Comments
 (0)