I've written a class that starts from a random Wikipedia page, chooses the first link in the main body, and then keeps following links until it reaches the Philosophy page. When I run the testCrawl() method, it performs 20 crawls, each from a random starting page, and then plots the lengths of all of the resulting paths. This generally works, but I want to confirm that the code looks clean and intuitive.
Points of concern: since there are a vast number of edge cases, I have multiple try/except blocks. Does this look unwieldy?
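For context, the kind of consolidation I've been considering (just a sketch; the `fetch` helper is hypothetical and not part of my class) is pulling the request try/except into one place, so callers only have to check for None:

```python
import requests

def fetch(url):
    '''Return the response for url, or None if the request fails.'''
    try:
        return requests.get(url)
    except requests.exceptions.RequestException as e:
        print("bad link: " + str(e))
        return None
```

That would collapse the two identical try/except blocks in crawlToPhilosophy into one.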
Also, the point of graphing the path lengths is to see what kind of distribution they form. If the histogram looks 'normal', can I assume normality, or is there a better (automated) way to check?
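For what it's worth, the automated option I've found (a sketch, assuming SciPy is available; the path lengths below are made up) is a normality test such as Shapiro-Wilk, which is suited to small samples like my 20 crawls (scipy.stats.normaltest is similar but expects larger samples):

```python
from scipy import stats

# Made-up path lengths standing in for the 20 crawl results
lens = [7, 9, 11, 8, 10, 12, 9, 8, 13, 10,
        9, 11, 7, 10, 12, 8, 9, 10, 11, 9]

stat, p_value = stats.shapiro(lens)
print("W = %.3f, p = %.3f" % (stat, p_value))
if p_value < 0.05:
    print("reject normality at the 5% level")
else:
    print("no evidence against normality")
```

I'm aware that with only 20 samples any such test has low power, so a Q-Q plot (e.g. scipy.stats.probplot) alongside the histogram might be more informative. The full code is below.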
```python
import requests
from lxml.html import fromstring
import json
from bs4 import BeautifulSoup, NavigableString, Tag
import sys
import matplotlib.pyplot as plt
import numpy as np

reload(sys)
sys.setdefaultencoding('utf-8')


class Crawler():
    '''Class used to crawl wikipedia pages starting from a random article.'''

    def __init__(self):
        self.baseUrl = "https://en.wikipedia.org"

    def reformatString(self, char, word):
        '''Lowercase the word and truncate it at the first
        occurrence of the passed-in char.'''
        word = word.lower()
        charIdx = word.find(char)
        if charIdx != -1:
            return word[:charIdx]
        return word

    def checkNameMatch(self, heading, string):
        '''Determine whether any token of the article heading contains
        (or is contained in) any token of the string.'''
        for i in range(len(string)):
            for j in range(len(heading)):
                if heading[j] in string[i] or string[i] in heading[j]:
                    return True
        return False

    def tokenize(self, word):
        '''Split the passed-in word on space characters and return
        a list of tokens.'''
        tokens = []
        currWord = ""
        for i in range(len(word)):
            if word[i] == " " and i == len(word) - 1:
                tokens.append(word.strip(" "))
                return tokens
            currWord += word[i]
            if word[i] == " ":
                tokens.append(currWord)
                currWord = ""
            if i == len(word) - 1:
                tokens.append(currWord)
        return tokens

    def getValidLink(self, currResponse):
        '''Takes an html response and returns the first link in the
        main body of the article.'''
        currRoot = BeautifulSoup(currResponse.text, "lxml")
        first = currRoot.select_one("#mw-content-text")  # locate main body
        par = first.find_all("p", recursive=False, limit=10)

        heading = currRoot.select_one("#firstHeading").text
        heading = self.reformatString('(', heading)
        headTokens = self.tokenize(heading)
        firstParagraphFound = False

        # Find which paragraph has the first link
        i = 0
        for i in range(len(par)):
            if par[i].b is not None:
                bold = ""
                for string in par[i].find_all("b"):
                    bold += " " + string.text
                bold = self.reformatString('(', bold)
                boldTokens = self.tokenize(bold)
                headingMatch = self.checkNameMatch(headTokens, boldTokens)
                if headingMatch:
                    firstParagraphFound = True
                if headingMatch and par[i].a:
                    break
            if par[i].a is not None:
                anchor = par[i].a.text
                if anchor:
                    anchor = self.reformatString('(', anchor)
                    aTokens = self.tokenize(anchor)
                    headingMatch = self.checkNameMatch(headTokens, aTokens)
                    if headingMatch:
                        break
            if firstParagraphFound and par[i].a:
                break

        # If none of the paragraphs have a link, the article may contain only a list
        if i >= len(par) - 1 and firstParagraphFound:
            ulist = first.find_all('ul')
            try:
                return ulist[0].li.a.attrs['href']
            except (IndexError, AttributeError):
                return None
        elif i >= len(par) - 1:
            print "\nReached article with no main body\n"
            return None

        mainBodyIdx = i
        # Track parentheses seen in the text so that the first link
        # *outside* parentheses is returned
        stack = []
        for child in par[mainBodyIdx].children:
            if isinstance(child, NavigableString):
                if "(" in child:
                    stack.append("(")
                if ")" in child:
                    try:
                        stack.pop()
                    except IndexError:
                        print "html malformed"
                        return None
            if isinstance(child, Tag) and child.name == "a" and not stack:
                try:
                    link = child.attrs['href']
                except KeyError:
                    print "\nAnchor with no href\n"
                    return None
                return str(self.reformatString('#', link))

    def crawlToPhilosophy(self, startUrl):
        '''Follow the path of each url until the philosophy page is
        reached and return the path.'''
        linkPath = []

        # Get first link
        try:
            initResponse = requests.get(startUrl)
        except requests.exceptions.RequestException as e:
            print "bad link: " + str(e)
            return None
        initLink = self.getValidLink(initResponse)
        if not initLink:
            return None
        linkPath.append(self.baseUrl + initLink)

        # Follow the path of links until the philosophy page is reached
        i = 0
        while True:
            if "philosophy" in linkPath[i].lower():
                break
            try:
                currResponse = requests.get(linkPath[i])
            except requests.exceptions.RequestException as e:
                print "bad link: " + str(e)
                return None
            currLink = self.getValidLink(currResponse)
            if not currLink:
                return None
            newLink = self.baseUrl + currLink
            for j in range(len(linkPath)):
                if newLink in linkPath[j]:  # loop found
                    print "loop found!"
                    return None
            linkPath.append(newLink)
            i += 1
        return linkPath

    def testCrawl(self, url):
        '''Find paths starting from 20 random articles.'''
        i = 0
        crawlList = []
        while i < 20:
            path = self.crawlToPhilosophy(url)
            if path is not None:
                crawlList.append(len(path))
            i += 1
        self.plotLengths(crawlList)

    def plotLengths(self, lens):
        '''Plot the distribution of path lengths.'''
        bins = [0, 2, 4, 6, 8, 10, 12, 14, 16]
        plt.hist(lens, bins, histtype='bar', rwidth=0.6)
        plt.xlabel('Path length')
        plt.ylabel('Frequency')
        plt.title('Distribution of path lengths')
        plt.show()


if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/Special:Random"
    crawler = Crawler()
    crawler.testCrawl(url)
```
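One more thing I wasn't sure about: crawlToPhilosophy detects loops by substring-matching the new link against every entry in linkPath. An alternative I considered (sketch below, with made-up links) is a set of visited URLs, which makes each check O(1) and avoids false positives when one URL happens to be a substring of another:

```python
visited = set()
linkPath = []

for link in ["https://en.wikipedia.org/wiki/A",
             "https://en.wikipedia.org/wiki/B",
             "https://en.wikipedia.org/wiki/A"]:  # third link revisits the first
    if link in visited:
        print("loop found!")
        break
    visited.add(link)
    linkPath.append(link)
```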