I wrote a little crawler and I am wondering how to properly assign the results to the instance being called.
My constructor sets up some basic properties and calls the next method which contains an if loop which might call a foreach loop. When all is done I echo my results.
This works perfectly fine but I don't want to echo my json_encode data. I rather want my $crawler variable at the bottom to contain the json_encode data.
This is my code:
<?php class Crawler { private $url; private $class; private $regex; private $htmlStack; private $pageNumber = 1; private $elementsArray; public function __construct($url, $class, $regex=null) { $this->url = $url; $this->class = $class; $this->regex = $regex; $this->curlGet($this->url); } private function curlGet($url) { $curl = curl_init(); curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE); curl_setopt($curl, CURLOPT_URL, $url); $this->htmlStack .= curl_exec($curl); $response = curl_getinfo($curl, CURLINFO_HTTP_CODE); $this->paginate($response); } private function paginate($response) { if($response === 200) { $this->pageNumber++; $url = $this->url . '?page=' . $this->pageNumber; $this->curlGet($url); } else { $this->CreateDomDocument(); } } private function curlGetDeep($link) { $curl = curl_init(); curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE); curl_setopt($curl, CURLOPT_URL, $link); $product = curl_exec($curl); $dom = new Domdocument(); @$dom->loadHTML($product); $xpath = new DomXpath($dom); $descriptions = $xpath->query('//div[contains(@class, "description")]'); foreach($descriptions as $description) { return $description->nodeValue; } } private function CreateDomDocument() { $dom = new Domdocument(); @$dom->loadHTML($this->htmlStack); $xpath = new DomXpath($dom); $elements = $xpath->query('//article[contains(@class, "' . $this->class . '")]'); foreach($elements as $element) { $title = $xpath->query('descendant::div[@class="title"]', $element); $title = $title->item(0)->nodeValue; $link = $xpath->query('descendant::a[@class="link-overlay"]', $element); $link = $link->item(0)->getAttribute('href'); $link = 'https://www.gall.nl' . $link; $image = $xpath->query('descendant::div[@class="image"]/node()/node()', $element); $image = $image->item(1)->getAttribute('src'); $description = $this->curlGetDeep($link); if($this->regex) { $title = preg_replace($this->regex, '', $title); } if(!preg_match('/\dX(\d+)?/', $title)) { $this->elementsArray[] = [ 'title' => $title, 'link' => $link, 'image' => $image, 'description' => $description ]; } } echo json_encode(['beers' => $this->elementsArray]); } } $crawler = new Crawler('https://www.gall.nl/shop/speciaal-bier/', 'product-block', '/\d+\,?\d*CL/i'); Github link for some more overview: https://github.com/stephan-v/crawler/blob/master/ArticleCrawler.php
Hopefully somebody can help me out since I am a bit confused here on how to go about getting this working properly.