
First of all, I know there are a bunch of similar questions, but none of them works for me...

I'm a newbie at Python, HTML and web scraping. I'm trying to scrape user information from a website that requires logging in first. In my tests I use scraping my own email settings from GitHub as the example. The main page is 'https://github.com/login' and the target page is 'https://github.com/settings/emails'.

Here is a list of the methods I've tried:

##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling it's session
br.open('https://github.com/login')
for f in br.forms():
    print f
br.select_form(nr=0)

# User credentials
br.form['login'] = 'myusername'
br.form['password'] = 'mypwd'

# Login
br.submit()
br.open('github.com/settings/emails').read()

################ Method 2
import urllib, urllib2, cookielib

username = 'myusername'
password = 'mypwd'

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'username': username, 'j_password': password})
opener.open('https://github.com/login', login_data)
resp = opener.open('https://github.com/settings/emails')
print resp.read()

############# Method 3
import urllib

opener = urllib.FancyURLopener()
print opener.open('http://myusername:[email protected]/settings/emails').read()

########## Method 4
import mechanize
import cookielib

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
br.addheaders = [('User-agent', 'Chrome')]
br.add_password('https://github.com/settings/emails', 'myusername', 'mypwd')
br.open('https://github.com/settings/emails')
print br.response().read()

############ Method 5
from requests import session

payload = {
    'action': 'login',
    'username': 'myusername',
    'password': 'mypwd'
}

with session() as c:
    c.post('https://github.com/login', data=payload)
    request = c.get('https://github.com/settings/emails')
    print request.headers
    print request.text

########### Method 6
import requests
from requests.packages.urllib3 import add_stderr_logger
import sys
from bs4 import BeautifulSoup as bs

add_stderr_logger()
s = requests.Session()
s.headers['User-Agent'] = 'Chrome'

username = 'myusername'
password = 'mypwd'
url = 'https://github.com/login'

# after examining the HTML of the website you're trying to log into
# set name_form to the name of the form element that contains the name and
# set password_form to the name of the form element that will contain the password
login = {'login': username, 'password': password}
login_response = s.post(url, data=login)
for r in login_response.history:
    if r.status_code == 401:  # 401 means authentication failed
        print 'error!'
        sys.exit(1)  # abort

pdf_response = s.get('https://github.com/settings/emails')  # Your cookies and headers are automatically included
soup = bs(pdf_response.content)

I've also read some discussions about the differences between HTTP Authentication and cookies, but still none of my attempts worked.
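For reference, the two mechanisms look quite different in code. Below is a minimal sketch of each with requests; GitHub's web login does not use HTTP Basic auth (only some of its API endpoints historically did), and the example.com URLs and field names are placeholders, so this is only to illustrate the distinction:

import requests

# HTTP (Basic) authentication: the credentials travel in the Authorization
# header of every single request, no login form involved
r = requests.get('https://api.github.com/user', auth=('myusername', 'mypwd'))

# Cookie-based login: you POST a login form once, the server sets a session
# cookie, and the Session object sends that cookie on all later requests
s = requests.Session()
s.post('https://example.com/login', data={'user': 'myusername', 'password': 'mypwd'})
r = s.get('https://example.com/settings')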

Any help would be appreciated. Thank you very much.

3 Answers


This works for me:

##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

# Browser
br = mechanize.Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling it's session
br.open('https://github.com/login')

# View available forms
for f in br.forms():
    print f

# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)

# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'

# Login
br.submit()

print(br.open('https://github.com/settings/emails').read())

You were not far off at all!
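Since the answer already imports BeautifulSoup, here is a minimal sketch of what you might do with the page once you are logged in. The selectors are assumptions (GitHub's markup for the e-mail settings page is not shown anywhere above), so inspect the HTML and adjust:

# continue from the logged-in mechanize browser above
html = br.open('https://github.com/settings/emails').read()
soup = BeautifulSoup(html)

# sanity check: if login failed, the title will still be the login page's
print soup.title

# dump the text of every list item; the real e-mail entries live in whatever
# markup GitHub actually uses, so refine this after looking at the page source
for li in soup.findAll('li'):
    print ' '.join(li.findAll(text=True))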


8 Comments

Could you give me more details on that? I did look at the scripts but did not see my username and password. Also, when I printed the forms in the first method (br.open('github.com/login') then for f in br.forms(): print f) I got the following output: <GET github.com/search application/x-www-form-urlencoded <TextControl(q=)> <HiddenControl(ref=cmdform) (readonly)>> <POST github.com/session application/x-www-form-urlencoded <HiddenControl(authenticity_token=dJrtaPu6AXWjB1jXA5i5V1qUyQ32CgLUKSnfMvhazV4=) (readonly)> <TextControl(login=)> <PasswordControl(password=)>>
That output shows two forms: the first (GET) is a search form; you are interested in the second (POST) form, which has the fields for "login" and "password" plus the hidden "authenticity_token" field that already carries a value. So you will need to submit that field and value together with the other two fields (which you fill in with your username and password).
In attempt 1 you will need to change br.select_form(nr=0) to br.select_form(nr=1). Also, please remove your comment above if that is your real password; it's not a good idea to make that public!
That works, thank you very much! An additional question: how would I know whether I should select the first or the second form? When I scraped my Gmail I did br.open('github.com/login'), then br.select_form(nr=0), then br.open('github.com/login').read(), and it worked (the only website where I had succeeded before). That's why I didn't even think it was because of the form I selected. Also, thank you for pointing out the password issue to me!
In your first comment above, you should see that when you printed the forms you had two forms, the first <GET github.com/search...> and the second <POST github.com/session...> You can see the second form contains fields called login and password so that is what indicates that the second form is a login form. It will be different for different webpages.
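To illustrate what this comment thread describes without mechanize, here is a rough sketch using requests and bs4 that pulls the hidden authenticity_token out of the login page and posts it together with the credentials. The field names and the /session action URL come from the form dump in the first comment, but GitHub may change them at any time:

import requests
from bs4 import BeautifulSoup

s = requests.Session()

# fetch the login page and extract the hidden CSRF token from the login form
login_page = s.get('https://github.com/login')
soup = BeautifulSoup(login_page.content)
token = soup.find('input', attrs={'name': 'authenticity_token'})['value']

# post the token together with the credentials to the form's action URL
payload = {
    'login': 'myusername',
    'password': 'mypwd',
    'authenticity_token': token,
}
s.post('https://github.com/session', data=payload)

# the session now carries the logged-in cookies
print s.get('https://github.com/settings/emails').text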

I'd love to add my solution alongside the others. This answer mainly follows the hacky/lazy approach I always take, mostly because I was too lazy to handle the cookies, session data, etc. myself.

This solution is most useful if you want to scrape multiple pages of a website after logging in with a single account's credentials (e.g. all of your Pinterest boards), not if you want to automate authentication with multiple accounts.

So my solution is Selenium along with Firefox profiles.

  • Create a new Firefox profile: note the location where it is stored, open Firefox with that profile, and log in to the website manually (see the details about Firefox profiles).
  • Now use Selenium with this profile: the Selenium session will use the cookies and session data from the Firefox profile, so your authentication persists.

I devised this mechanism when I came across the need to scrape a few Pinterest pages. I have added a few lines of code from the sample showing how to use the profile; adapt the code to your needs.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException

# replace with your firefox profile
fp = webdriver.FirefoxProfile('C:/Users/SJ/AppData/Roaming/Mozilla/Firefox/Profiles/hlsfrs2o.scrape')

# enter your url here
url = ""

driver = webdriver.Firefox(fp)
driver.get(url)
html_source = driver.page_source
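From there you can hand html_source to whatever parser you like. A minimal sketch with bs4, just to show the hand-off (the link dump is only an example of what you might extract):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_source)

# e.g. collect every link on the authenticated page
for a in soup.find_all('a'):
    print a.get('href')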



The classic way to approach this problem is:

  1. launch a browser, go to the site and find the login page
  2. inspect the source code of the page to find out:
     I. which form is the login form (a page can have many forms, but usually one of them is the login form)
     II. which field names are used for the username and password (these can vary a lot)
     III. whether there are other fields that must be submitted (like an authentication token)
  3. write the Scrapy spider to replicate the form submission using FormRequest
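For point 3, a hand-written version (before automating anything) might look roughly like the sketch below. It uses Scrapy's FormRequest.from_response, which copies hidden inputs such as the authenticity token for you; the spider name is made up, and the field names are the ones GitHub's login form used at the time:

from scrapy.spider import BaseSpider
from scrapy.http import FormRequest


class ManualGitHubLogin(BaseSpider):

    name = 'ManualGitHubLogin'
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # from_response carries over the hidden fields (e.g. authenticity_token);
        # we only fill in the visible username/password fields
        return FormRequest.from_response(
            response,
            formnumber=1,  # the first form on the page is GitHub's search box, the second is the login form
            formdata={'login': 'myusername', 'password': 'mypwd'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # we are now logged in and can request/parse member-only pages
        self.log('Logged in, page length: %d' % len(response.body))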

Being fans of automation, we figured we could write some code to automate point 2 (which is actually the most time-consuming part), and the result is loginform, a library to automatically fill in login forms given the login page, username and password. Here is the code of a simple spider that uses loginform to log in to sites automatically.

githubloginspider.py

from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.http.request import Request
from loginform import fill_login_form
from scrapy import log
from scraping.articles import ArticleItem


class GitHubLogin(BaseSpider):

    name = 'GitHubLogin'
    allowed_domains = ['github.com']
    start_urls = ['http://github.com/login']
    login_user = 'ranvijay5686'
    login_pass = ''

    def parse(self, response):
        (args, url, method) = fill_login_form(response.url, response.body,
                                              self.login_user, self.login_pass)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login)

    def after_login(self, response):
        # for link in response.xpath("//*[@id='site-container']/div[2]/div[4]/p/a/@href").extract():
        item = ArticleItem()
        item['title'] = 'ranvijay'
        log.msg('*************** : '
                + str(response.xpath("//form[@class='subnav-search left']/input/@value").extract()))
        item['url'] = \
            response.xpath("//*[@id='site-container']/div[1]/div/div/span/span/text()").extract()
        yield item

items.py

from scrapy.item import Item, Field


class ArticleItem(Item):
    title = Field()
    url = Field()

loginform.py

import sys
from argparse import ArgumentParser
from collections import defaultdict
from lxml import html

__version__ = '1.0'  # also update setup.py


def _form_score(form):
    score = 0

    # In case of user/pass or user/pass/remember-me
    if len(form.inputs.keys()) in (2, 3):
        score += 10

    typecount = defaultdict(int)
    for x in form.inputs:
        type_ = (x.type if isinstance(x, html.InputElement) else 'other')
        typecount[type_] += 1

    if typecount['text'] > 1:
        score += 10
    if not typecount['text']:
        score -= 10

    if typecount['password'] == 1:
        score += 10
    if not typecount['password']:
        score -= 10

    if typecount['checkbox'] > 1:
        score -= 10
    if typecount['radio']:
        score -= 10

    return score


def _pick_form(forms):
    """Return the form most likely to be a login form"""
    return sorted(forms, key=_form_score, reverse=True)[0]


def _pick_fields(form):
    """Return the most likely field names for username and password"""
    userfield = passfield = emailfield = None
    for x in form.inputs:
        if not isinstance(x, html.InputElement):
            continue
        type_ = x.type
        if type_ == 'password' and passfield is None:
            passfield = x.name
        elif type_ == 'text' and userfield is None:
            userfield = x.name
        elif type_ == 'email' and emailfield is None:
            emailfield = x.name
    return (userfield or emailfield, passfield)


def submit_value(form):
    """Returns the value for the submit input, if any"""
    for x in form.inputs:
        if x.type == 'submit' and x.name:
            return [(x.name, x.value)]
    else:
        return []


def fill_login_form(url, body, username, password):
    doc = html.document_fromstring(body, base_url=url)
    form = _pick_form(doc.xpath('//form'))
    (userfield, passfield) = _pick_fields(form)
    form.fields[userfield] = username
    form.fields[passfield] = password
    form_values = form.form_values() + submit_value(form)
    return (form_values, form.action or form.base_url, form.method)


def main():
    ap = ArgumentParser()
    ap.add_argument('-u', '--username', default='username')
    ap.add_argument('-p', '--password', default='secret')
    ap.add_argument('url')
    args = ap.parse_args()

    try:
        import requests
    except ImportError:
        print 'requests library is required to use loginform as a tool'

    r = requests.get(args.url)
    (values, action, method) = fill_login_form(args.url, r.text,
                                               args.username, args.password)
    print '''url: {0}
method: {1}
payload:'''.format(action, method)
    for (k, v) in values:
        print '- {0}: {1}'.format(k, v)


if __name__ == '__main__':
    sys.exit(main())
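A quick sketch of using loginform on its own, outside of Scrapy, just to see what it would extract from a login page (the credentials are placeholders):

import requests
from loginform import fill_login_form

# download the login page and let loginform pick the form and field names
r = requests.get('https://github.com/login')
values, action, method = fill_login_form('https://github.com/login', r.text,
                                          'myusername', 'mypwd')

# 'values' holds the username, password and any hidden fields the form needs;
# 'action' is the URL to submit to and 'method' is usually POST
print method, action
for name, value in values:
    print '- {0}: {1}'.format(name, value)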

