|
1 | 1 | # HTML Scraper |
2 | | -A PHP class to simplify data extraction from HTML. |
| 2 | +A set of PHP classes to simplify data extraction from HTML. |
3 | 3 |
|
4 | 4 | --- |
5 | 5 |
|
6 | | ->Base code for the *CSS_to_Xpath* method in *HTML_Scraper* was cloned from [https://github.com/zendframework/zend-dom](https://github.com/zendframework/zend-dom). |
7 | | -> |
| 6 | +>Base code for the *CSS_to_Xpath* method in *HTML_Scraper* was cloned from [https://github.com/zendframework/zend-dom](https://github.com/zendframework/zend-dom). |
8 | 7 | >Zend Framework |
9 | | ->: [http://framework.zend.com/](http://framework.zend.com/) |
10 | | -> |
| 8 | +>: [http://framework.zend.com/](http://framework.zend.com/) |
11 | 9 | >Repository |
12 | | ->: [http://github.com/zendframework/zf2](http://github.com/zendframework/zf2) |
13 | | -> |
14 | | ->Copyright (c) 2005-2015 Zend Technologies USA Inc. [http://www.zend.com](http://www.zend.com) |
15 | | -> |
| 10 | +>: [http://github.com/zendframework/zf2](http://github.com/zendframework/zf2) |
| 11 | +>Copyright (c) 2005-2015 Zend Technologies USA Inc. [http://www.zend.com](http://www.zend.com) |
16 | 12 | >License |
17 | 13 | >: [https://framework.zend.com/license](https://framework.zend.com/license) New BSD License |
18 | 14 | --- |
19 | | -## Static methods: |
20 | | ---- |
21 | | --`CSS_to_Xpath(string $selector) : string` |
22 | | - |
23 | | -Translate *CSS* selector to *XPath* path query. |
24 | | - |
25 | | -*Returns:* |
26 | | -- `string` containing the equivalent *XPath* path query. |
27 | | ---- |
28 | | --`from($source [, bool $utf = TRUE])` |
29 | | - |
30 | | -Create new `HTML_Scraper` object from various sources. |
31 | | - |
32 | | -`$source` can be of type |
33 | | -- `DOMNodeList` |
34 | | -- `DOMNode` |
35 | | -- `string` containing HTML |
36 | | - |
37 | | -*Returns:* |
38 | | -- `array` of `HTML_Scraper` objects when `$source instanceof DOMNodeList` |
39 | | -- `HTML_Scraper` object when `$source instanceof DOMNode` |
40 | | -- `HTML_Scraper` object when `$source` is `string` |
41 | | ---- |
42 | | - -`outerHTML(DOMNode $node) : string` |
43 | | - |
44 | | -Extract *outerHTML* from a `DOMNode` |
45 | 15 |
|
46 | | -*Returns:* |
47 | | -- `string` containing *outerHTML* of the `DOMNode` |
48 | | ---- |
49 | | --`innerHTML(DOMNode $node) : string` |
50 | | - |
51 | | -Extract *innerHTML* from a `DOMNode` |
52 | | - |
53 | | -*Returns:* |
54 | | -- `string` containing *innerHTML* of the `DOMNode` |
55 | | ---- |
56 | | -## Methods: |
57 | | ---- |
58 | | --`__toString() : string` |
59 | | - |
60 | | -***Magic*** method to convert `HTML_Scraper` object to HTML `string`. |
61 | | ---- |
62 | | --`from_querySelector(string $selector, int $item = NULL, bool $utf = TRUE)` |
| 16 | +For *basic* documentation, see the DOC file. |
63 | 17 |
|
64 | | -Create `HTML_Scraper` object (or `array` of objects) from `DOMNode` (or `DOMNodeList`) that matches the specified *CSS* selector. |
65 | | - |
66 | | -Returns `NULL` when no match is found. |
67 | | ---- |
68 | | --`from_xpath(string $expr, int $item = NULL, bool $utf = TRUE)` |
69 | | - |
70 | | -Create `HTML_Scraper` object (or `array` of objects) from `DOMNode` (or `DOMNodeList`) that matches the specified *XPath* path expression. |
71 | | - |
72 | | -Returns `NULL` when no match is found. |
73 | | ---- |
74 | | --`getBody() : string` |
75 | | - |
76 | | -Get *innerHTML* of `document.body` |
77 | | ---- |
78 | | --`getHead() : string` |
79 | | - |
80 | | -Get *innerHTML* of `document.head` |
81 | | ---- |
82 | | --`load_HTML_file(string $filename, bool $utf = TRUE, resource $context = NULL) : bool` |
83 | | - |
84 | | -Load *HTML* text from local or remote file. |
85 | | - |
86 | | -Returns `TRUE` on success and `FALSE` on failure. |
87 | | ---- |
88 | | --`load_HTML_str(string $source, bool $utf = TRUE) : bool` |
89 | | - |
90 | | -Load *HTML* text from `string`. |
91 | | - |
92 | | -Returns `TRUE` on success and `FALSE` on failure. |
93 | | ---- |
94 | | --`querySelector(string $selector, int $item = NULL)` |
95 | | - |
96 | | -Returns `DOMNodeList` (or `DOMNode` when `$item` index is specified) that matches the specified *CSS* selector. |
97 | | - |
98 | | -`$item` is *0-indexed*. |
99 | | - |
100 | | -Returns `NULL` when no match is found. |
101 | | ---- |
102 | | --`querySelector_innerHTML(string $expr, int $item = 0)` |
103 | | - |
104 | | -Returns *innerHTML* of the `DOMNode` that matches the specified *CSS* selector. |
105 | | - |
106 | | -Returns `NULL` when no match is found. |
107 | | ---- |
108 | | --`querySelector_outerHTML(string $expr, int $item = 0)` |
109 | | - |
110 | | -Returns *outerHTML* of the `DOMNode` that matches the specified *CSS* selector. |
111 | | - |
112 | | -Returns `NULL` when no match is found. |
113 | | ---- |
114 | | --`querySelector_textContent(string $expr, int $item = 0)` |
115 | | - |
116 | | -Returns *textContent* of the `DOMNode` that matches the specified *CSS* selector. |
117 | | - |
118 | | -Returns `NULL` when no match is found. |
119 | | ---- |
120 | | --`xpath(string $expr, int $item = NULL)` |
121 | | - |
122 | | -Returns `DOMNodeList` (or `DOMNode` when `$item` index is specified) that matches the specified *XPath* path expression. |
123 | | - |
124 | | -`$item` is *0-indexed*. |
| 18 | +### Example |
| 19 | +```php |
| 20 | +<?php |
| 21 | +require_once 'HTML_Scraper.php'; |
125 | 22 |
|
126 | | -Returns `NULL` when no match is found. |
127 | | ---- |
128 | | --`xpath_innerHTML(string $expr, int $item = 0)` |
| 23 | +$doc = new HTML_Scraper; |
129 | 24 |
|
130 | | -Returns *innerHTML* of the `DOMNode` that matches the specified *XPath* path expression. |
| 25 | +if(!$doc->load_HTML_file('https://www.royalroad.com/fiction/10073/the-wandering-inn')) { |
| 26 | +echo 'Unable to load data'; |
| 27 | +exit(1); |
| 28 | +} |
131 | 29 |
|
132 | | -Returns `NULL` when no match is found. |
133 | | ---- |
134 | | --`xpath_outerHTML(string $expr, int $item = 0)` |
| 30 | +$data = []; |
135 | 31 |
|
136 | | -Returns *outerHTML* of the `DOMNode` that matches the specified *XPath* path expression. |
| 32 | +$data['title'] = $doc->querySelector_extract('textContentTrim', 'div.fic-title h1[property="name"]', 0); |
137 | 33 |
|
138 | | -Returns `NULL` when no match is found. |
139 | | ---- |
140 | | --`xpath_textContent(string $expr, int $item = 0)` |
| 34 | +$data['url'] = $doc->xpath_extract(function($meta) { |
| 35 | +return $meta->getAttribute('content'); |
| 36 | +}, '//meta[@property="og:url"]', 0); |
141 | 37 |
|
142 | | -Returns *textContent* of the `DOMNode` that matches the specified *XPath* path expression. |
| 38 | +$data['description'] = $doc->querySelector_extract(function(&$div) { |
| 39 | +return trim(DOMNodeHelper::innerHTML($div)); |
| 40 | +}, 'div.description div[property="description"]', 0); |
143 | 41 |
|
144 | | -Returns `NULL` when no match is found. |
145 | | ---- |
146 | | -## Example: |
147 | | -```php |
148 | | -<?php |
149 | | -$doc = new HTML_Scraper; |
150 | | -if($doc->load_HTML_file('sample_data_file.html') === TRUE) { |
151 | | -$title = $doc->querySelector_textContent('.fic-title [property="name"]', 0); |
152 | | -echo "Fiction name is {$title}.<br />", |
153 | | - |
154 | | -$rows = $doc->querySelector('#chapters tbody tr'); |
155 | | -echo "There are ", count($rows), "chapters. <br />"; |
| 42 | +$data['tags'] = $doc->querySelector_extract('textContentTrim', 'span.tags span[property="genre"]'); |
156 | 43 |
|
157 | | -echo "First chapter is called", $doc->querySelector_textContent('#chapters tbody tr a', 0), "<br />"; |
158 | | -} |
| 44 | +var_dump($data); |
159 | 45 | ?> |
160 | 46 | ``` |
0 commit comments