14

I have a CasperJS script that scrapes ratings and dates from one webpage. Now I want to scrape the same data from multiple pages under the same website. How can I loop through the different subpages given this code:

// CasperJS script: scrape the review ratings and review dates from a single
// product page and write them to a text file, one "rating: date" pair per line.
var ratings = [];
var dates = [];

var fs = require('fs');
var casper = require('casper').create({
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        loadImages: false,
        loadPlugins: false
    }
});

// Collects the `title` attribute of every overall-rating image.
// Passed to casper.evaluate(), so it only uses the page's own `document`
// and must not rely on anything declared elsewhere in this script.
function getRatings() {
    var images = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
    var titles = [];
    for (var k = 0; k < images.length; k++) {
        titles.push(images[k].getAttribute('title'));
    }
    return titles;
}

// Collects the inner HTML of every review-date element.
// Passed to casper.evaluate(), same constraints as getRatings().
function getDate() {
    var spans = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
    var texts = [];
    for (var k = 0; k < spans.length; k++) {
        texts.push(spans[k].innerHTML);
    }
    return texts;
}

casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function () {
    this.echo('hi');
});

// Pull both lists out of the loaded page into the script-level variables.
casper.then(function () {
    ratings = this.evaluate(getRatings);
    dates = this.evaluate(getDate);
    this.echo(ratings);
});

// Once all steps have run: merge the two lists, write the file, and exit.
casper.run(function () {
    var total = ratings.length;
    this.echo(total + ' ratings found:');

    // Fold each date into its rating entry and blank out the date slot.
    for (var idx = 0; idx < total; idx += 1) {
        ratings[idx] = ratings[idx] + ': ' + dates[idx];
        dates[idx] = '';
    }
    this.echo(ratings);

    // Mode 'w' overwrites any previous output.
    fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", ratings.join("\n"), 'w');

    this.echo(dates.length + ' dates found:');
    this.exit();
});

Any help is appreciated :)

1

3 Answers 3

28

Since there exists a next page button, you can use it to traverse all pages recursively:

/**
 * Scrapes the ratings and dates from the currently loaded review page,
 * appends them to the output file, and, while a "next page" link is visible,
 * clicks it and schedules itself again for the following page.
 *
 * Relies on `casper`, `fs`, `ratings`, `dates`, `getRatings` and `getDate`
 * being defined as in the question's script.
 */
function getRatingsAndWrite() {
    ratings = casper.evaluate(getRatings);
    dates = casper.evaluate(getDate);
    casper.echo(ratings);
    casper.echo(ratings.length + ' ratings found:');

    // Fold each date into its rating entry and blank out the date slot.
    for (var i = 0; i < ratings.length; i++) {
        ratings[i] = ratings[i] + ': ' + dates[i];
        dates[i] = '';
    }
    casper.echo(ratings);

    // The file is written in append mode ('a') once per page, so each chunk
    // must end with a newline; otherwise the last record of one page and the
    // first record of the next page end up on the same line.
    var content = ratings.join("\n");
    if (ratings.length > 0) {
        content += "\n";
    }
    fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a');
    casper.echo(dates.length + ' dates found:');

    var nextLink = ".BVRRPageLink.BVRRNextPage > a";
    if (casper.visible(nextLink)) {
        casper.thenClick(nextLink);
        // Give the next page of reviews time to load before scraping again;
        // without a pause the same page can be scraped more than once.
        casper.wait(3000);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}

casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm');
casper.then(getRatingsAndWrite);
casper.run();

A related answer is A: CasperJS parse next page after button click.

Sign up to request clarification or add additional context in comments.

Comments

6

This code may help: you define an array of objects holding the URL and the selectors you want for each page, and then loop over that array, doing whatever you need with those properties.

Instead of opening a URL, you can also use a click method inside the loop.

// Table-driven scraping: each entry names a page URL plus the selectors to
// read on that page. Assumes a `casper` instance has already been created;
// this snippet does not create one.
var navigation = [
    {
        url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    },
    {
        url: 'yourSecondUrl, etc...',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    }
];

// Accumulates everything scraped across all pages.
var content = "";

casper.start()
    .then(function () {
        // Loop over the array and schedule one open + wait per entry.
        navigation.forEach(function (navIndex) {
            // Open the url stored in the `url` property.
            casper.thenOpen(navIndex.url)
                // Wait for the page to load; probably redundant, since
                // thenOpen() is expected to do this already.
                .waitForUrl(navIndex.url, function () {
                    // Value of the `title` attribute for the ratings selector.
                    var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title');
                    // HTML for the date selector. The property is named
                    // `selectorDate` (singular) in the navigation entries.
                    var dates = this.getHTML(navIndex.selectorDate);
                    this.echo(ratings);
                    this.echo(dates);
                    content = content + ' ' + ratings + ' ' + dates;
                });
        });
    })
    .run(function () {
        this.echo('----------- All steps done ------------\n');
        this.exit();
    });

4 Comments

The problem is that there are more pages than page links, therefore you need to do this recursively.
Yes, I didn't look at his URL before posting, but he can still combine a click() + wait() with a counter.
Yes, it would be possible, but you need to nest the steps at least once, because you cannot schedule the click+wait without knowing the number of pages.
Unfortunately, this is not working for me. I got Wait timeout of 5000ms expired, exiting. I tried to change it to 20s, but still the same error.
2

Thanks Fanch and Artjom B. Both of your answers contributed to the working solution. I used the recursive walk through the 'next' pages of the pagination, as given by Artjom B. Then I added a wait() call to make sure the next ratings page has loaded before scraping it. Without this wait(), the same page is scraped multiple times between the moment 'next' is clicked and the moment the next page finishes loading. See the working code below:

// CasperJS script: walk through every page of reviews by following the
// "next page" link, and append one "rating: date" line per review to a file.
var ratings = [];
var dates = [];

var casper = require('casper').create({
    pageSettings: {
        loadImages: false,
        loadPlugins: false
    },
    logLevel: "debug",
    verbose: true
});

var fs = require('fs');

// Collects the `title` attribute of every overall-rating image.
// Passed to casper.evaluate(), so it only uses the page's own `document`.
function getRatings() {
    var ratings = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
    return Array.prototype.map.call(ratings, function (e) {
        return e.getAttribute('title');
    });
}

// Collects the inner HTML of every review-date element.
// Passed to casper.evaluate(), so it only uses the page's own `document`.
function getDate() {
    var dates = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
    return Array.prototype.map.call(dates, function (e) {
        return e.innerHTML;
    });
}

/**
 * Scrapes the ratings and dates from the currently loaded review page,
 * appends them to the output file, and, while a "next page" link is visible,
 * clicks it, waits, and schedules itself again for the following page.
 */
function getRatingsAndWrite() {
    ratings = casper.evaluate(getRatings);
    dates = casper.evaluate(getDate);
    casper.echo(ratings.length + ' ratings found:');

    for (var i = 0; i < ratings.length; i++) {
        // Keep only the first character of the rating title.
        var rating = ratings[i].charAt(0);
        ratings[i] = rating + ': ' + dates[i];
        dates[i] = '';
    }

    // The file is written in append mode ('a') once per page, so each chunk
    // must end with a newline; otherwise the last record of one page and the
    // first record of the next page end up on the same line.
    var content = ratings.join("\n");
    if (ratings.length > 0) {
        content += "\n";
    }
    fs.write("<filepath to write content>", content, 'a');
    casper.echo(dates.length + ' dates found:');

    var nextLink = ".BVRRPageLink.BVRRNextPage > a";
    if (casper.visible(nextLink)) {
        casper.thenClick(nextLink);
        // Pause so the next page of ratings is loaded before it is scraped;
        // without it the same page gets scraped multiple times.
        casper.wait(3000);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}

casper.start('http://www.t-mobile.com/cell-phones/htc-one-m8.html');
casper.then(getRatingsAndWrite);
casper.run();

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.