anshu-krishna
diff --git a/‎.gitattributes‎
Lines changed: 2 additions & 0 deletions b/‎.gitattributes‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎DOC.md‎
Lines changed: 156 additions & 0 deletions b/‎DOC.md‎
Lines changed: 156 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 25 additions & 139 deletions b/‎README.md‎
Lines changed: 25 additions & 139 deletions
@@ -0,0 +1,2 @@
+* linguist-vendored
+*.php linguist-vendored=false
@@ -0,0 +1,156 @@
+# Class HTML_Scraper
+### Static Functions:
+-`new_from($source)`
+
+Create a new HTML_Scraper object from the passed source. 
+`$source` can be of type `DOMNodeList`, `DOMNode` or `string`.
+
+**Returns:** 
+| Type | Description |
+|------|-------------|
+| `array` | When `$source` is an instance of `DOMNodeList` then returns an `array` of `HTML_Scraper` objects. |
+| `HTML_Scraper` | When `$source` is an instance of `DOMNode` or a `string` |
+
+
+-`CSS_to_Xpath(string $path) : string`
+
+Translates CSS selector to XPath expression.
+
+### Functions:
+-`__toString() : string`
+
+Magic function to convert `HTML_Scraper` into a `string` containing the HTML code of the loaded document.
+
+
+-`textContent() : string`
+
+Get the *textContent* of the loaded HTML document.
+
+
+-`load_HTML_str(string $source, int $options = NULL) : bool`
+
+Load HTML from a string.
+
+-	`$options` 
+It is for passing LIBXML constant flags. `LIBXML_NOERROR | LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED` is always applied (even when `$options` is `NULL`).
+
+Returns `TRUE` on success and `FALSE` on failure.
+
+
+-`load_HTML_file(string $filename, int $options = NULL, array $context = NULL) : bool`
+
+Load HTML from a file.
+
+-	`$options` 
+*see `$options` in `HTML_Scraper->load_HTML_str()`*
+
+-	`$context` 
+*see `$context` in `stream_context_create()`*
+
+Returns `TRUE` on success and `FALSE` on failure.
+
+
+-`xpath(string $expr, int ...$items)`
+
+Get the `DOMNode`(s) that match the passed *XPath* path expression.
+
+-	`$items` 
+Index of the `DOMNode` to be returned in the `DOMNodeList` matching the *XPath* path expression. 
+It is 0-indexed. (*i.e.* to get first node use `0`, for second node use `1` and so on). 
+Negative values can be used for referencing the list item from the end. (*i.e.* use `-1` for last node, `-2` for second last node and so on). 
+If an invalid index is used, `NULL` is returned. (*i.e.* if only two nodes match the *XPath* path expression then using `3` will return `NULL`).
+
+**Returns:** 
+| Type | Description |
+|------|-------------|
+| `NULL` | When no node matches the XPath path expression |
+| `DOMNodeList` | When no `...$items` are passed |
+| `DOMNode` | When only one `...$items` is passed |
+| `array` | When more than one `...$items` are passed. Array contains `DOMNode` or `NULL` |
+
+Returns `DOMNodeList` (or `DOMNode` when a single `...$items` index is specified) that matches the specified *XPath* path expression.
+
+
+-`querySelector(string $selector, int ...$items)`
+
+Same as `HTML_Scraper->xpath()` except that it uses CSS selector instead of *XPath* path expression.
+
+-`xpath_extract($mapper, string $expr, int ...$items)`
+
+Find `DOMNode`(s) in the same way as in `HTML_Scraper->xpath()` then extract data from the `DOMNode`(s) as specified by the `$mapper`.
+
+-	`$mapper` 
+It can be any one of the `string` specified below or a `function` that takes a `DOMNode` and returns any extracted value. 
+| Mapper Value | Description |
+|---|---|
+| `'innerHTML'` | Maps `DOMNode` to its *innerHTML* |
+| `'outerHTML'` | Maps `DOMNode` to its *outerHTML* |
+| `'textContent'` | Maps `DOMNode` to its *textContent* |
+| `'textContentTrim'` | Maps `DOMNode` to its *textContent* without any whitespace at the beginning or at the end of the *textContent* |
+
+-`querySelector_extract($mapper, string $selector, int ...$items)`
+
+Same as `HTML_Scraper->xpath_extract()` except that it uses CSS selector instead of *XPath* path expression.
+
+---
+
+# Class DOMNodeHelper
+
+### Static Functions:
+
+-`innerHTML(DOMNode &$node) : string`
+
+Returns *innerHTML* of the passed `DOMNode`.
+
+
+-`outerHTML(DOMNode &$node) : string`
+
+Returns *outerHTML* of the passed `DOMNode`.
+
+
+-`xpath(DOMNode &$node, string $expr, int ...$items)`
+
+Similar to `HTML_Scraper->xpath()` except that it works on a `DOMNode` instead of the `HTML_Scraper`'s `DOMDocument`.
+
+-`querySelector(DOMNode &$node, string $selector, int ...$items)`
+
+Similar to `DOMNodeHelper::xpath()` except it uses CSS selector instead of a *XPath* path expression.
+
+-`getChildNode(DOMNode &$node, int ...$indexes)`
+
+Get one or more child nodes of the `DOMNode`.
+
+-	`$indexes` 
+*See `$items` in `HTML_Scraper->xpath()`.*
+
+**Returns:**
+
+| Type | Description |
+|---|---|
+| `DOMNodeList` | When no `...$indexes` is passed |
+| `DOMNode` | When only one `...$indexes` is passed |
+| `array` | When more than one `...$indexes` is passed. Array contains `DOMNode` or `NULL` |
+
+
+-`getChildElements(DOMNode &$node, int ...$indexes) : array`
+
+Same as `DOMNodeHelper::getChildNode()` except that it works on child **elements** instead of child **nodes**.
+
+-`remove_self(DOMNode &$node)`
+
+Removes the `DOMNode` from its parent `DOMDocument`.
+
+-`filter_child_elements_xpath(DOMNode &$node, string ...$exprs)`
+
+Removes the child elements of the passed `DOMNode` that match the passed *XPath* path expression(s).
+
+-`filter_child_elements_querySelector(DOMNode &$node, string ...$selectors)`
+
+Removes the child elements of the passed `DOMNode` that match the passed CSS selector(s).
+
+-`filter_child_elements_index(DOMNode &$node, int ...$indexes)`
+
+Removes the child elements of the passed `DOMNode` specified by the `...$indexes`.
+
+-	`$indexes` 
+*See `$items` in `HTML_Scraper->xpath()`.*
@@ -1,160 +1,46 @@
 # HTML Scraper
-A PHP class to simplify data extraction from HTML.
+A set of PHP classes to simplify data extraction from HTML.
 
 ---
 
->Base code for the *CSS_to_Xpath* method in *HTML_Scraper* was cloned from [https://github.com/zendframework/zend-dom](https://github.com/zendframework/zend-dom).
->
+>Base code for the *CSS_to_Xpath* method in *HTML_Scraper* was cloned from [https://github.com/zendframework/zend-dom](https://github.com/zendframework/zend-dom). 
 >Zend Framework
->: [http://framework.zend.com/](http://framework.zend.com/)
->
+>: [http://framework.zend.com/](http://framework.zend.com/) 
 >Repository
->: [http://github.com/zendframework/zf2](http://github.com/zendframework/zf2)
->
->Copyright (c) 2005-2015 Zend Technologies USA Inc. [http://www.zend.com](http://www.zend.com)
->
+>: [http://github.com/zendframework/zf2](http://github.com/zendframework/zf2) 
+>Copyright (c) 2005-2015 Zend Technologies USA Inc. [http://www.zend.com](http://www.zend.com) 
 >License
 >: [https://framework.zend.com/license](https://framework.zend.com/license) New BSD License
 ---
-## Static methods:
----
--`CSS_to_Xpath(string $selector) : string`
-
-Translate *CSS* selector to *XPath* path query.
-
-*Returns:*
--	`string` containing the equivalent *XPath* path query.
----
--`from($source [, bool $utf = TRUE])`
-
-Create new `HTML_Scraper` object from various sources.
-
-`$source` can be of type
--	`DOMNodeList`
--	`DOMNode`
--	`string` containing HTML
-
-*Returns:*
--	`array` of `HTML_Scraper` objects when `$source instanceof DOMNodeList`
--	`HTML_Scraper` object when `$source instanceof DOMNode`
--	`HTML_Scraper` object when `$source` is `string`
----
- -`outerHTML(DOMNode $node) : string`
-
-Extract *outerHTML* from a `DOMNode`
 
-*Returns:*
--	`string` containing *outerHTML* of the `DOMNode`
----
--`innerHTML(DOMNode $node) : string`
-
-Extract *innerHTML* from a `DOMNode`
-
-*Returns:*
--	`string` containing *innerHTML* of the `DOMNode`
----
-## Methods:
----
--`__toString() : string`
-
-***Magic*** method to convert `HTML_Scraper` object to HTML `string`.
----
--`from_querySelector(string $selector, int $item = NULL, bool $utf = TRUE)`
+For *basic* documentation see the [DOC.md](DOC.md) file.
 
-Create `HTML_Scraper` object (or `array` of objects) from `DOMNode` (or `DOMNodeList`) that matches the specified *CSS* selector.
-
-Returns `NULL` when no match is found.
----
--`from_xpath(string $expr, int $item = NULL, bool $utf = TRUE)`
-
-Create `HTML_Scraper` object (or `array` of objects) from `DOMNode` (or `DOMNodeList`) that matches the specified *XPath* path expression.
-
-Returns `NULL` when no match is found.
----
--`getBody() : string`
-
-Get *innerHTML* of `document.body`
----
--`getHead() : string`
-
-Get *innerHTML* of `document.head`
----
--`load_HTML_file(string $filename, bool $utf = TRUE, resource $context = NULL) : bool`
-
-Load *HTML* text from local or remote file.
-
-Returns `TRUE` on success and `FALSE` on failure.
----
--`load_HTML_str(string $source, bool $utf = TRUE) : bool`
-
-Load *HTML* text from `string`.
-
-Returns `TRUE` on success and `FALSE` on failure.
----
--`querySelector(string $selector, int $item = NULL)`
-
-Returns `DOMNodeList` (or `DOMNode` when `$item` index is specified) that matches the specified *CSS* selector.
-
-`$item` is *0-indexed*.
-
-Returns `NULL` when no match is found.
----
--`querySelector_innerHTML(string $expr, int $item = 0)`
-
-Returns *innerHTML* of the `DOMNode` that matches the specified *CSS* selector.
-
-Returns `NULL` when no match is found.
----
--`querySelector_outerHTML(string $expr, int $item = 0)`
-
-Returns *outerHTML* of the `DOMNode` that matches the specified *CSS* selector.
-
-Returns `NULL` when no match is found.
----
--`querySelector_textContent(string $expr, int $item = 0)`
-
-Returns *textContent* of the `DOMNode` that matches the specified *CSS* selector.
-
-Returns `NULL` when no match is found.
----
--`xpath(string $expr, int $item = NULL)`
-
-Returns `DOMNodeList` (or `DOMNode` when `$item` index is specified) that matches the specified *XPath* path expression.
-
-`$item` is *0-indexed*.
+### Example
+```php
+<?php
+require_once 'HTML_Scraper.php';
 
-Returns `NULL` when no match is found.
----
--`xpath_innerHTML(string $expr, int $item = 0)`
+$doc = new HTML_Scraper;
 
-Returns *innerHTML* of the `DOMNode` that matches the specified *XPath* path expression.
+if(!$doc->load_HTML_file('https://www.royalroad.com/fiction/10073/the-wandering-inn')) {
+echo 'Unable to load data';
+exit(1);
+}
 
-Returns `NULL` when no match is found.
----
--`xpath_outerHTML(string $expr, int $item = 0)`
+$data = [];
 
-Returns *outerHTML* of the `DOMNode` that matches the specified *XPath* path expression.
+$data['title'] = $doc->querySelector_extract('textContentTrim', 'div.fic-title h1[property="name"]', 0);
 
-Returns `NULL` when no match is found.
----
--`xpath_textContent(string $expr, int $item = 0)`
+$data['url'] = $doc->xpath_extract(function($meta) {
+return $meta->getAttribute('content');
+}, '//meta[@property="og:url"]', 0);
 
-Returns *textContent* of the `DOMNode` that matches the specified *XPath* path expression.
+$data['description'] = $doc->querySelector_extract(function(&$div) {
+return trim(DOMNodeHelper::innerHTML($div));
+}, 'div.description div[property="description"]', 0);
 
-Returns `NULL` when no match is found.
----
-## Example:
-```php
-<?php
-$doc = new HTML_Scraper;
-if($doc->load_HTML_file('sample_data_file.html') === TRUE) {
-$title = $doc->querySelector_textContent('.fic-title [property="name"]', 0);
-echo "Fiction name is {$title}.<br />",
-
-$rows = $doc->querySelector('#chapters tbody tr');
-echo "There are ", count($rows), "chapters. <br />";
+$data['tags'] = $doc->querySelector_extract('textContentTrim', 'span.tags span[property="genre"]');
 
-echo "First chapter is called", $doc->querySelector_textContent('#chapters tbody tr a', 0), "<br />";
-}
+var_dump($data);
 ?>
 ```
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+* linguist-vendored`
	`2`	`+*.php linguist-vendored=false`