=item *
C<$context_ns> - context tag namespace, default C<html>
=item *
C<$options> - parser options
See L<"PARSER OPTIONS"> for details.
=back
# simple create new fragment
my $node = $tree->parseFragment("some
bold and
italic text");
# create new fragment node with custom context tag/namespace and options
my $node = $tree->parseFragment("some
bold and
italic text", "div", "html", {
# some options override
encoding => "windows-1251"
});
print $node->html; # some
bold and
italic text
=head3 document
my $node = $tree->document;
Return L<HTML5::DOM::Document|/HTML5::DOM::Document> node of current tree.
=head3 root
my $node = $tree->root;
Return root node of current tree. (always C<< <html> >>)
=head3 head
my $node = $tree->head;
Return C<< <head> >> node of current tree.
=head3 body
my $node = $tree->body;
Return C<< <body> >> node of current tree.
=head3 at
=head3 querySelector
my $node = $tree->at($selector);
my $node = $tree->querySelector($selector); # alias
Find one element node in tree using L<CSS Selectors Level 4|https://www.w3.org/TR/selectors-4/>
Return node, or C<undef> if not found.
=over
=item *
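C<$selector> - selector query as plain text or precompiled as L<HTML5::DOM::CSS::Selector|/HTML5::DOM::CSS::Selector> or
L<HTML5::DOM::CSS::Selector::Entry|/HTML5::DOM::CSS::Selector::Entry>.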
=back
my $tree = HTML5::DOM->new->parse('red
blue
');
my $node = $tree->at('body > div.red');
print $node->html; # red
=head3 find
=head3 querySelectorAll
my $collection = $tree->find($selector);
my $collection = $tree->querySelectorAll($selector); # alias
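Find all element nodes in tree using L<CSS Selectors Level 4|https://www.w3.org/TR/selectors-4/>
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.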
=over
=item *
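C<$selector> - selector query as plain text or precompiled as L<HTML5::DOM::CSS::Selector|/HTML5::DOM::CSS::Selector> or
L<HTML5::DOM::CSS::Selector::Entry|/HTML5::DOM::CSS::Selector::Entry>.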
=back
my $tree = HTML5::DOM->new->parse('red
blue
');
my $collection = $tree->find('body > div.red, body > div.blue');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
=head3 findId
=head3 getElementById
my $node = $tree->findId($id);
my $node = $tree->getElementById($id); # alias
Find element node with specified id.
Return L<HTML5::DOM::Node|/HTML5::DOM::Node> or C<undef>.
my $tree = HTML5::DOM->new->parse('red
blue
');
my $node = $tree->findId('test');
print $node->html; # blue
=head3 findTag
=head3 getElementsByTagName
my $collection = $tree->findTag($tag);
my $collection = $tree->getElementsByTagName($tag); # alias
Find all element nodes in tree with specified tag name.
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new->parse('red
blue
');
my $collection = $tree->findTag('div');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
=head3 findClass
=head3 getElementsByClassName
my $collection = $tree->findClass($class);
my $collection = $tree->getElementsByClassName($class); # alias
Find all element nodes in tree with specified class name.
This is a faster equivalent of the [class~="value"] selector.
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new
->parse('red
blue
');
my $collection = $tree->findClass('color');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
=head3 findAttr
=head3 getElementByAttribute
# Find all elements with attribute
my $collection = $tree->findAttr($attribute);
my $collection = $tree->getElementByAttribute($attribute); # alias
# Find all elements with attribute and matching value
my $collection = $tree->findAttr($attribute, $value, $case = 0, $cmp = '=');
my $collection = $tree->getElementByAttribute($attribute, $value, $case = 0, $cmp = '='); # alias
Find all element nodes in tree with specified attribute and optional matching value.
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new
->parse('red
blue
');
my $collection = $tree->findAttr('class', 'CoLoR', 1, '~');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
CSS selector analogs:
# [$attribute=$value]
my $collection = $tree->findAttr($attribute, $value, 0, '=');
# [$attribute=$value i]
my $collection = $tree->findAttr($attribute, $value, 1, '=');
# [$attribute~=$value]
my $collection = $tree->findAttr($attribute, $value, 0, '~');
# [$attribute|=$value]
my $collection = $tree->findAttr($attribute, $value, 0, '|');
# [$attribute*=$value]
my $collection = $tree->findAttr($attribute, $value, 0, '*');
# [$attribute^=$value]
my $collection = $tree->findAttr($attribute, $value, 0, '^');
# [$attribute$=$value]
my $collection = $tree->findAttr($attribute, $value, 0, '$');
=head3 encoding
=head3 encodingId
print "encoding: ".$tree->encoding."\n"; # UTF-8
print "encodingId: ".$tree->encodingId."\n"; # 0
Return current tree encoding. See L<"ENCODINGS"> for details.
=head3 tag2id
print "tag id: ".HTML5::DOM->TAG_A."\n"; # tag id: 4
print "tag id: ".$tree->tag2id("a")."\n"; # tag id: 4
Convert tag name to id. Return 0 (HTML5::DOM->TAG__UNDEF) if the tag does not exist in the tree.
See L<"TAGS"> for tag constants list.
=head3 id2tag
print "tag name: ".$tree->id2tag(4)."\n"; # tag name: a
print "tag name: ".$tree->id2tag(HTML5::DOM->TAG_A)."\n"; # tag name: a
Convert tag id to name. Return C<undef> if the tag id does not exist in the tree.
See L<"TAGS"> for tag constants list.
=head3 namespace2id
print "ns id: ".HTML5::DOM->NS_HTML."\n"; # ns id: 1
print "ns id: ".$tree->namespace2id("html")."\n"; # ns id: 1
Convert namespace name to id. Return 0 (HTML5::DOM->NS_UNDEF) if the namespace does not exist in the tree.
See L<"NAMESPACES"> for namespace constants list.
=head3 id2namespace
print "ns name: ".$tree->id2namespace(1)."\n"; # ns name: html
print "ns name: ".$tree->id2namespace(HTML5::DOM->NS_HTML)."\n"; # ns name: html
Convert namespace id to name. Return C<undef> if the namespace id does not exist.
See L<"NAMESPACES"> for namespace constants list.
=head3 parser
my $parser = $tree->parser;
Return parent L<HTML5::DOM|/HTML5::DOM>.
=head3 utf8
As getter - returns C<1> if all methods return strings with the utf8 flag enabled.
Example with utf8:
use warnings;
use strict;
use HTML5::DOM;
use utf8;
my $tree = HTML5::DOM->new->parse("тест ");
my $is_utf8_enabled = $tree->utf8;
print "is_utf8_enabled=".($is_utf8_enabled ? "true" : "false")."\n"; # true
Or example with bytes:
use warnings;
use strict;
use HTML5::DOM;
my $tree = HTML5::DOM->new->parse("тест ");
my $is_utf8_enabled = $tree->utf8;
print "is_utf8_enabled=".($is_utf8_enabled ? "true" : "false")."\n"; # false
As setter - enable or disable utf8 flag on all returned strings.
use warnings;
use strict;
use HTML5::DOM;
use utf8;
my $tree = HTML5::DOM->new->parse("тест ");
print "is_utf8_enabled=".($tree->utf8 ? "true" : "false")."\n"; # true
print length($tree->at('b')->text)." chars\n"; # 4 chars
$tree->utf8(0);
print "is_utf8_enabled=".($tree->utf8 ? "true" : "false")."\n"; # false
print length($tree->at('b')->text)." bytes\n"; # 8 bytes
=head1 HTML5::DOM::Node
DOM node object.
=head3 tag
=head3 nodeName
my $tag_name = $node->tag;
my $tag_name = $node->nodeName; # uppercase
my $tag_name = $node->tagName; # uppercase
Return node tag name (eg. div or span)
$node->tag($tag);
$node->nodeName($tag); # alias
$node->tagName($tag); # alias
Set new node tag name. Allowed only for L<HTML5::DOM::Element|/HTML5::DOM::Element> nodes.
print $node->html; #
$node->tag('span');
print $node->html; #
print $node->tag; # span
print $node->tagName; # SPAN
=head3 tagId
my $tag_id = $node->tagId;
Return node tag id. See L<"TAGS"> for tag constants list.
$node->tagId($tag_id);
Set new node tag id. Allowed only for L<HTML5::DOM::Element|/HTML5::DOM::Element> nodes.
print $node->html; #
$node->tagId(HTML5::DOM->TAG_SPAN);
print $node->html; #
print $node->tagId; # 117
=head3 namespace
my $tag_ns = $node->namespace;
Return node namespace (eg. html or svg)
$node->namespace($namespace);
Set new node namespace name. Allowed only for L<HTML5::DOM::Element|/HTML5::DOM::Element> nodes.
print $node->namespace; # html
$node->namespace('svg');
print $node->namespace; # svg
=head3 namespaceId
my $tag_ns_id = $node->namespaceId;
Return node namespace id. See L<"NAMESPACES"> for namespace constants list.
$node->namespaceId($namespace_id);
Set new node namespace by id. Allowed only for L<HTML5::DOM::Element|/HTML5::DOM::Element> nodes.
print $node->namespace; # html
$node->namespaceId(HTML5::DOM->NS_SVG);
print $node->namespaceId; # 3
print $node->namespace; # svg
=head3 tree
my $tree = $node->tree;
Return parent L<HTML5::DOM::Tree|/HTML5::DOM::Tree>.
=head3 nodeType
my $type = $node->nodeType;
Return node type. All types:
HTML5::DOM->ELEMENT_NODE => 1,
HTML5::DOM->ATTRIBUTE_NODE => 2, # not supported
HTML5::DOM->TEXT_NODE => 3,
HTML5::DOM->CDATA_SECTION_NODE => 4, # not supported
HTML5::DOM->ENTITY_REFERENCE_NODE => 5, # not supported
HTML5::DOM->ENTITY_NODE => 6, # not supported
HTML5::DOM->PROCESSING_INSTRUCTION_NODE => 7, # not supported
HTML5::DOM->COMMENT_NODE => 8,
HTML5::DOM->DOCUMENT_NODE => 9,
HTML5::DOM->DOCUMENT_TYPE_NODE => 10,
HTML5::DOM->DOCUMENT_FRAGMENT_NODE => 11,
HTML5::DOM->NOTATION_NODE => 12 # not supported
Compatible with: L<https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType>
=head3 next
=head3 nextElementSibling
my $node2 = $node->next;
my $node2 = $node->nextElementSibling; # alias
Return next sibling element node
my $tree = HTML5::DOM->new->parse('
');
my $li = $tree->at('ul li');
print $li->text; # Linux
print $li->next->text; # OSX
print $li->next->next->text; # Windows
=head3 prev
=head3 previousElementSibling
my $node2 = $node->prev;
my $node2 = $node->previousElementSibling; # alias
Return previous sibling element node
my $tree = HTML5::DOM->new->parse('
');
my $li = $tree->at('ul li.win');
print $li->text; # Windows
print $li->prev->text; # OSX
print $li->prev->prev->text; # Linux
=head3 nextNode
=head3 nextSibling
my $node2 = $node->nextNode;
my $node2 = $node->nextSibling; # alias
Return next sibling node
my $tree = HTML5::DOM->new->parse('
');
my $li = $tree->at('ul li');
print $li->text; # Linux
print $li->nextNode->text; #
print $li->nextNode->nextNode->text; # OSX
=head3 prevNode
=head3 previousSibling
my $node2 = $node->prevNode;
my $node2 = $node->previousSibling; # alias
Return previous sibling node
my $tree = HTML5::DOM->new->parse('
');
my $li = $tree->at('ul li.win');
print $li->text; # Windows
print $li->prevNode->text; # OSX
print $li->prevNode->prevNode->text; #
=head3 first
=head3 firstElementChild
my $node2 = $node->first;
my $node2 = $node->firstElementChild; # alias
Return first child element
my $tree = HTML5::DOM->new->parse('
');
my $ul = $tree->at('ul');
print $ul->first->text; # Linux
=head3 last
=head3 lastElementChild
my $node2 = $node->last;
my $node2 = $node->lastElementChild; # alias
Return last child element
my $tree = HTML5::DOM->new->parse('
');
my $ul = $tree->at('ul');
print $ul->last->text; # Windows
=head3 firstNode
=head3 firstChild
my $node2 = $node->firstNode;
my $node2 = $node->firstChild; # alias
Return first child node
my $tree = HTML5::DOM->new->parse('
');
my $ul = $tree->at('ul');
print $ul->firstNode->html; #
=head3 lastNode
=head3 lastChild
my $node2 = $node->lastNode;
my $node2 = $node->lastChild; # alias
Return last child node
my $tree = HTML5::DOM->new->parse('
');
my $ul = $tree->at('ul');
print $ul->lastNode->html; #
=head3 html
Universal html serialization and fragment parsing accessor, for a single human-friendly api.
my $html = $node->html();
my $node = $node->html($new_html);
=over
=item *
As getter this is similar to L<outerHTML|/outerHTML>
=item *
As setter this is similar to L<innerHTML|/innerHTML>
=item *
As setter for non-element nodes this is similar to L<nodeValue|/nodeValue>
=back
my $tree = HTML5::DOM->new->parse('some text bold
');
# get text content for element
my $node = $tree->at('#test');
print $node->html; # some text bold
$node->html('new ');
print $node->html; # new
my $comment = $tree->createComment(" comment text ");
print $comment->html; #
$comment->html(' new comment text ');
print $comment->html; #
my $text_node = $tree->createTextNode("plain text >");
print $text_node->html; # plain text >
$text_node->html('new>plain>text');
print $text_node->html; # new>plain>text
=head3 innerHTML
=head3 outerHTML
=over
=item *
HTML serialization of the node and its descendants.
my $html = $node->html;
my $html = $node->outerHTML;
Example:
my $tree = HTML5::DOM->new->parse('some bold test
');
print $tree->getElementById('test')->outerHTML; # some bold test
print $tree->createComment(' test ')->outerHTML; #
print $tree->createTextNode('test')->outerHTML; # test
=item *
HTML serialization of the node's descendants.
# serialize descendants, without node
my $html = $node->innerHTML;
Example:
my $tree = HTML5::DOM->new->parse('some bold test
');
print $tree->getElementById('test')->innerHTML; # some bold test
print $tree->createComment(' test ')->innerHTML; #
print $tree->createTextNode('test')->innerHTML; # test
=item *
Removes all of the element's descendants and replaces them with nodes constructed by parsing the HTML given in the string B<$new_html>.
# parse fragment and replace child nodes with it
my $html = $node->html($new_html);
my $html = $node->innerHTML($new_html);
Example:
my $tree = HTML5::DOM->new->parse('some bold test
');
print $tree->at('#test')->innerHTML('italic ');
print $tree->body->innerHTML; # italic
=item *
HTML serialization of entire document
my $html = $tree->document->html;
my $html = $tree->document->outerHTML;
Example:
my $tree = HTML5::DOM->new->parse('some bold test
');
print $tree->document->outerHTML; # some bold test
=item *
Replaces the element and all of its descendants with a new DOM tree constructed by parsing the specified B<$new_html>.
# parse fragment and replace node in parent node children with it
my $html = $node->outerHTML($new_html);
Example:
my $tree = HTML5::DOM->new->parse('some bold test
');
print $tree->at('#test')->outerHTML('italic ');
print $tree->body->innerHTML; # italic
=back
See, for more info:
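L<https://developer.mozilla.org/en-US/docs/Web/API/Element/innerHTML>
L<https://developer.mozilla.org/en-US/docs/Web/API/Element/outerHTML>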
=head3 text
Universal text accessor, for a single human-friendly api.
my $text = $node->text();
my $node = $node->text($new_text);
=over
=item *
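For L<HTML5::DOM::Text|/HTML5::DOM::Text> is similar to L<nodeValue|/nodeValue> (as setter/getter)
=item *
For L<HTML5::DOM::Comment|/HTML5::DOM::Comment> is similar to L<nodeValue|/nodeValue> (as setter/getter)
=item *
For L<HTML5::DOM::DocType|/HTML5::DOM::DocType> is similar to L<nodeValue|/nodeValue> (as setter/getter)
=item *
For L<HTML5::DOM::Element|/HTML5::DOM::Element> is similar to L<textContent|/textContent> (as setter/getter)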
=back
my $tree = HTML5::DOM->new->parse('some text bold
');
# get text content for element
my $node = $tree->at('#test');
print $node->text; # some text bold
$node->text('<new node content>');
print $node->html; # <new node content>
my $comment = $tree->createComment("comment text");
print $comment->text; # comment text
$comment->text(' new comment text ');
print $comment->html; #
my $text_node = $tree->createTextNode("plain text");
print $text_node->text; # plain text
$text_node->text('new>plain>text');
print $text_node->html; # new>plain>text
=head3 innerText
=head3 outerText
=head3 textContent
=over
=item *
Represents the "rendered" text content of a node and its descendants.
Using default CSS "display" property for tags based on Firefox user-agent style.
Only works for elements, for other nodes return C<undef>.
my $text = $node->innerText;
my $text = $node->outerText; # alias
Example:
my $tree = HTML5::DOM->new->parse('
some
bold
test
');
print $tree->body->innerText; # some bold test
See, for more info: L<https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText>
=item *
Removes all of its children and replaces them with text nodes created from the given value.
Only works for elements, for other nodes throws exception.
=over
=item *
All new line chars (\r\n, \r, \n) are replaced with C<< <br> >> elements
=item *
All other text content is replaced with text nodes
=back
my $node = $node->innerText($text);
Example:
my $tree = HTML5::DOM->new->parse('some text bold
');
$tree->at('#test')->innerText("some\nnew\ntext >");
print $tree->at('#test')->html; # some new text >
See, for more info: L<https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText>
=item *
Removes the current node and replaces it with the given text.
Only works for elements, for other nodes throws exception.
=over
=item *
All new line chars (\r\n, \r, \n) are replaced with C<< <br> >> elements
=item *
All other text content is replaced with text nodes
=item *
Similar to innerText($text), but removes current node
=back
my $node = $node->outerText($text);
Example:
my $tree = HTML5::DOM->new->parse('some text bold
');
$tree->at('#test')->outerText("some\nnew\ntext >");
print $tree->body->html; # some new text >
See, for more info: L<https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/outerText>
=item *
Represents the text content of a node and its descendants.
Only works for elements, for other nodes return C<undef>.
my $text = $node->text;
my $text = $node->textContent; # alias
Example:
my $tree = HTML5::DOM->new->parse(' test ');
print $tree->body->text; # test alert()
See, for more info: L<https://developer.mozilla.org/en-US/docs/Web/API/Node/textContent>
=item *
Removes all of its children and replaces them with a single text node with the given value.
my $node = $node->text($new_text);
my $node = $node->textContent($new_text);
Example:
my $tree = HTML5::DOM->new->parse('some bold test
');
print $tree->at('#test')->text('');
print $tree->at('#test')->html; # <bla bla bla>
See, for more info: L<https://developer.mozilla.org/en-US/docs/Web/API/Node/textContent>
=back
=head3 nodeHtml
my $html = $node->nodeHtml();
Serialize to html, without descendants and closing tag.
my $tree = HTML5::DOM->new->parse('some bold test
');
print $tree->at('#test')->nodeHtml(); #
=head3 nodeValue
=head3 data
my $value = $node->nodeValue();
my $value = $node->data(); # alias
my $node = $node->nodeValue($new_value);
my $node = $node->data($new_value); # alias
Get or set value of node. Only works for non-element nodes, such as L
, L,
L. Return C for other.
my $tree = HTML5::DOM->new->parse('');
my $comment = $tree->createComment("comment text");
print $comment->nodeValue; # comment text
$comment->nodeValue(' new comment text ');
print $comment->html; #
=head3 isConnected
my $flag = $node->isConnected;
Return true, if node has parent.
my $tree = HTML5::DOM->new->parse('
');
print $tree->at('#test')->isConnected; # 1
print $tree->createElement("div")->isConnected; # 0
=head3 parent
=head3 parentElement
my $node = $node->parent;
my $node = $node->parentElement; # alias
Return parent node. Return C<undef> if the node is detached.
my $tree = HTML5::DOM->new->parse('
');
print $tree->at('#test')->parent->tag; # body
=head3 document
=head3 ownerDocument
my $doc = $node->document;
my $doc = $node->ownerDocument; # alias
Return parent L<HTML5::DOM::Document|/HTML5::DOM::Document>.
my $tree = HTML5::DOM->new->parse('
');
print ref($tree->at('#test')->document); # HTML5::DOM::Document
=head3 append
=head3 appendChild
my $node = $node->append($child);
my $child = $node->appendChild($child); # alias
Append node to child nodes.
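B<append> - returned value is the self node, for chain calls
B<appendChild> - returned value is the appended child except when the given child is a L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment>,
in which case the empty L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment> is returned.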
my $tree = HTML5::DOM->new->parse('
some bold text
');
$tree->at('div')
->append($tree->createElement('br'))
->append($tree->createElement('br'));
print $tree->at('div')->html; # some bold text
=head3 prepend
=head3 prependChild
my $node = $node->prepend($child);
my $child = $node->prependChild($child); # alias
Prepend node to child nodes.
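B<prepend> - returned value is the self node, for chain calls
B<prependChild> - returned value is the prepended child except when the given child is a L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment>,
in which case the empty L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment> is returned.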
my $tree = HTML5::DOM->new->parse('
some bold text
');
$tree->at('div')
->prepend($tree->createElement('br'))
->prepend($tree->createElement('br'));
print $tree->at('div')->html; # some bold text
=head3 replace
=head3 replaceChild
my $old_node = $old_node->replace($new_node);
my $old_node = $old_node->parent->replaceChild($new_node, $old_node); # alias
Replace node in parent child nodes.
my $tree = HTML5::DOM->new->parse('
some bold text
');
my $old = $tree->at('b')->replace($tree->createElement('br'));
print $old->html; # bold
print $tree->at('div')->html; # some text
=head3 before
=head3 insertBefore
my $node = $node->before($new_node);
my $new_node = $node->parent->insertBefore($new_node, $node); # alias
Insert new node before current node.
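B<before> - returned value is the self node, for chain calls
B<insertBefore> - returned value is the added child except when the given child is a L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment>,
in which case the empty L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment> is returned.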
my $tree = HTML5::DOM->new->parse('
some bold text
');
$tree->at('b')->before($tree->createElement('br'));
print $tree->at('div')->html; # some bold text
=head3 after
=head3 insertAfter
my $node = $node->after($new_node);
my $new_node = $node->parent->insertAfter($new_node, $node); # alias
Insert new node after current node.
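B<after> - returned value is the self node, for chain calls
B<insertAfter> - returned value is the added child except when the given child is a L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment>,
in which case the empty L<HTML5::DOM::Fragment|/HTML5::DOM::Fragment> is returned.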
my $tree = HTML5::DOM->new->parse('
some bold text
');
$tree->at('b')->after($tree->createElement('br'));
print $tree->at('div')->html; # some bold text
=head3 remove
=head3 removeChild
my $node = $node->remove;
my $node = $node->parent->removeChild($node); # alias
Remove node from parent. Return removed node.
my $tree = HTML5::DOM->new->parse('
some bold text
');
print $tree->at('b')->remove->html; # bold
print $tree->at('div')->html; # some text
=head3 clone
=head3 cloneNode
# clone node to current tree
my $node = $node->clone($deep = 0);
my $node = $node->cloneNode($deep = 0); # alias
# clone node to foreign tree
my $node = $node->clone($deep, $new_tree);
my $node = $node->cloneNode($deep, $new_tree); # alias
Clone node.
B<$deep> = 0 - only specified node, without children.
B<$deep> = 1 - deep copy with all child nodes.
B<$new_tree> - destination tree (if the node must be copied to a foreign tree)
my $tree = HTML5::DOM->new->parse('
some bold text
');
print $tree->at('b')->clone(0)->html; #
print $tree->at('b')->clone(1)->html; # bold
=head3 void
my $flag = $node->void;
Return true if node is void. For more details: L<https://html.spec.whatwg.org/multipage/syntax.html#void-elements>
print $tree->createElement('br')->void; # 1
=head3 selfClosed
my $flag = $node->selfClosed;
Return true if node self closed.
print $tree->createElement('br')->selfClosed; # 1
=head3 position
my $position = $node->position;
Return offsets in input buffer.
print Dumper($node->position);
# $VAR1 = {'raw_length' => 3, 'raw_begin' => 144, 'element_begin' => 143, 'element_length' => 5}
=head3 isSameNode
my $flag = $node->isSameNode($other_node);
Tests whether two nodes are the same, that is if they reference the same object.
my $tree = HTML5::DOM->new->parse('
');
my $li = $tree->find('li');
print $li->[0]->isSameNode($li->[0]); # 1
print $li->[0]->isSameNode($li->[1]); # 0
print $li->[0]->isSameNode($li->[2]); # 0
=head1 HTML5::DOM::Element
DOM node object for elements. Inherit all methods from L<HTML5::DOM::Node|/HTML5::DOM::Node>.
=head3 children
my $collection = $node->children;
Returns all child elements of current node in L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->at('ul')->children;
print $collection->[0]->html; # Perl
print $collection->[1]->html; # PHP
print $collection->[2]->html; # C++
=head3 childrenNode
=head3 childNodes
my $collection = $node->childrenNode;
my $collection = $node->childNodes; # alias
Returns all child nodes of current node in L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->at('ul')->childrenNode;
print $collection->[0]->html; # Perl
print $collection->[1]->html; #
print $collection->[2]->html; # PHP
print $collection->[3]->html; # C++
=head3 attr
=head3 removeAttr
Universal attributes accessor, for single human-friendly api.
# attribute get
my $value = $node->attr($key);
# attribute set
my $node = $node->attr($key, $value);
my $node = $node->attr($key => $value);
# attribute remove
my $node = $node->attr($key, undef);
my $node = $node->attr($key => undef);
my $node = $node->removeAttr($key);
# bulk attributes set
my $node = $node->attr({$key => $value, $key2 => $value2});
# bulk attributes remove
my $node = $node->attr({$key => undef, $key2 => undef});
# bulk get all attributes in hash
my $hash = $node->attr;
Example:
my $tree = HTML5::DOM->new->parse('
');
my $div = $tree->at('#test');
$div->attr("data-new", "test");
print $div->attr("data-test"); # test value
print $div->{"data-test"}; # test value
print $div->attr->{"data-test"}; # test value
# {id => "test", "data-test" => "test value", "data-href" => "#", "data-new" => "test"}
print Dumper($div->attr);
$div->removeAttr("data-test");
# {id => "test", "data-href" => "#", "data-new" => "test"}
print Dumper($div->attr);
=head3 attrArray
my $arr = $node->attrArray;
Get all attributes in array (in tree order).
my $tree = HTML5::DOM->new->parse('
');
my $div = $tree->at('#test');
# [{key => 'id', value => 'test'}, {key => 'data-test', value => 'test'}, {key => 'data-href', value => '#'}]
print Dumper($div->attrArray);
=head3 getAttribute
my $value = $node->getAttribute($key);
my $value = $node->attr($key); # alias
Get attribute value by key.
=head3 setAttribute
my $node = $node->setAttribute($key, $value);
my $node = $node->attr($key, $value); # alias
Set new value or create new attribute.
=head3 removeAttribute
my $node = $node->removeAttribute($key);
my $node = $node->removeAttr($key); # alias
Remove attribute.
=head3 className
my $classes = $node->className;
# alias for
my $classes = $node->attr("class");
=head3 classList
my $class_list = $node->classList;
# has class
my $flag = $class_list->has($class_name);
my $flag = $class_list->contains($class_name);
# add class
my $class_list = $class_list->add($class_name);
my $class_list = $class_list->add($class_name, $class_name1, $class_name2, ...);
# remove class
my $class_list = $class_list->remove($class_name);
my $class_list = $class_list->remove($class_name, $class_name1, $class_name2, ...);
# toggle class
my $state = $class_list->toggle($class_name);
my $state = $class_list->toggle($class_name, $force_state);
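Manipulations with classes. Returns L<HTML5::DOM::TokenList|/HTML5::DOM::TokenList>.
Similar to L<https://developer.mozilla.org/en-US/docs/Web/API/Element/classList>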
my $tree = HTML5::DOM->new->parse('red
');
my $node = $tree->body->at('.red');
print $node->classList->has('red'); # 1
print $node->classList->has('blue'); # 0
$node->classList->add('blue', 'red', 'yellow', 'orange');
print $node->className; # red blue yellow orange
$node->classList->remove('blue', 'orange');
print $node->className; # red yellow
print $node->classList->toggle('blue'); # 1
print $node->className; # red yellow blue
print $node->classList->toggle('blue'); # 0
print $node->className; # red yellow
=head3 at
=head3 querySelector
my $node = $node->at($selector);
my $node = $node->at($selector, $combinator);
my $node = $node->querySelector($selector); # alias
my $node = $node->querySelector($selector, $combinator); # alias
Find one element node in current node descendants using L<CSS Selectors Level 4|https://www.w3.org/TR/selectors-4/>
Return node, or C<undef> if not found.
=over
=item *
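C<$selector> - selector query as plain text or precompiled as L<HTML5::DOM::CSS::Selector|/HTML5::DOM::CSS::Selector> or
L<HTML5::DOM::CSS::Selector::Entry|/HTML5::DOM::CSS::Selector::Entry>.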
=item *
C<$combinator> - custom selector combinator, applies to current node
=over
=item *
C<E<gt>E<gt>> - descendant selector (default)
=item *
C<E<gt>> - child selector
=item *
C<+> - adjacent sibling selector
=item *
C<~> - general sibling selector
=item *
C<||> - column combinator
=back
=back
my $tree = HTML5::DOM->new->parse('red
blue
');
my $node = $tree->body->at('body > div.red');
print $node->html; # red
=head3 find
=head3 querySelectorAll
my $collection = $node->find($selector);
my $collection = $node->find($selector, $combinator);
my $collection = $node->querySelectorAll($selector); # alias
my $collection = $node->querySelectorAll($selector, $combinator); # alias
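Find all element nodes in current node descendants using L<CSS Selectors Level 4|https://www.w3.org/TR/selectors-4/>
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.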
=over
=item *
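C<$selector> - selector query as plain text or precompiled as L<HTML5::DOM::CSS::Selector|/HTML5::DOM::CSS::Selector> or
L<HTML5::DOM::CSS::Selector::Entry|/HTML5::DOM::CSS::Selector::Entry>.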
=item *
C<$combinator> - custom selector combinator, applies to current node
=over
=item *
C<E<gt>E<gt>> - descendant selector (default)
=item *
C<E<gt>> - child selector
=item *
C<+> - adjacent sibling selector
=item *
C<~> - general sibling selector
=item *
C<||> - column combinator
=back
=back
my $tree = HTML5::DOM->new->parse('red
blue
');
my $collection = $tree->body->find('body > div.red, body > div.blue');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
=head3 findId
=head3 getElementById
my $node = $node->findId($id);
my $node = $node->getElementById($id); # alias
Find element node with specified id in current node descendants.
Return L<HTML5::DOM::Node|/HTML5::DOM::Node> or C<undef>.
my $tree = HTML5::DOM->new->parse('red
blue
');
my $node = $tree->body->findId('test');
print $node->html; # blue
=head3 findTag
=head3 getElementsByTagName
my $collection = $node->findTag($tag);
my $collection = $node->getElementsByTagName($tag); # alias
Find all element nodes in current node descendants with specified tag name.
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new->parse('red
blue
');
my $collection = $tree->body->findTag('div');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
=head3 findClass
=head3 getElementsByClassName
my $collection = $node->findClass($class);
my $collection = $node->getElementsByClassName($class); # alias
Find all element nodes in current node descendants with specified class name.
This is a faster equivalent of the [class~="value"] selector.
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new
->parse('red
blue
');
my $collection = $tree->body->findClass('color');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
=head3 findAttr
=head3 getElementByAttribute
# Find all elements with attribute
my $collection = $node->findAttr($attribute);
my $collection = $node->getElementByAttribute($attribute); # alias
# Find all elements with attribute and matching value
my $collection = $node->findAttr($attribute, $value, $case = 0, $cmp = '=');
my $collection = $node->getElementByAttribute($attribute, $value, $case = 0, $cmp = '='); # alias
Find all element nodes in current node descendants with specified attribute and optional matching value.
Return L<HTML5::DOM::Collection|/HTML5::DOM::Collection>.
my $tree = HTML5::DOM->new
->parse('red
blue
');
my $collection = $tree->body->findAttr('class', 'CoLoR', 1, '~');
print $collection->[0]->html; # red
print $collection->[1]->html; # blue
CSS selector analogs:
# [$attribute=$value]
my $collection = $node->findAttr($attribute, $value, 0, '=');
# [$attribute=$value i]
my $collection = $node->findAttr($attribute, $value, 1, '=');
# [$attribute~=$value]
my $collection = $node->findAttr($attribute, $value, 0, '~');
# [$attribute|=$value]
my $collection = $node->findAttr($attribute, $value, 0, '|');
# [$attribute*=$value]
my $collection = $node->findAttr($attribute, $value, 0, '*');
# [$attribute^=$value]
my $collection = $node->findAttr($attribute, $value, 0, '^');
# [$attribute$=$value]
my $collection = $node->findAttr($attribute, $value, 0, '$');
=head3 getDefaultBoxType
my $display = $node->getDefaultBoxType;
Get default CSS "display" property for tag (useful for functions like L<innerText|/innerText>).
my $tree = HTML5::DOM->new
->parse('red
bbb ');
print $tree->at('div')->getDefaultBoxType(); # block
print $tree->at('script')->getDefaultBoxType(); # none
print $tree->at('b')->getDefaultBoxType(); # inline
=head1 HTML5::DOM::Document
DOM node object for document. Inherit all methods from L.
=head1 HTML5::DOM::Fragment
DOM node object for fragments. Inherit all methods from L.
=head1 HTML5::DOM::Text
DOM node object for text. Inherit all methods from L.
=head1 HTML5::DOM::Comment
DOM node object for comments. Inherit all methods from L.
=head1 HTML5::DOM::DocType
DOM node object for document type. Inherit all methods from L.
=head3 name
my $name = $node->name;
my $node = $node->name($new_name);
Return or change root element name from doctype.
my $tree = HTML5::DOM->new->parse('
');
# get
print $tree->document->firstChild->name; # svg
# set
$tree->document->firstChild->name('html');
print $tree->document->firstChild->html; #
=head3 publicId
my $public_id = $node->publicId;
my $node = $node->publicId($new_public_id);
Return or change public id from doctype.
my $tree = HTML5::DOM->new->parse('
');
# get
print $tree->document->firstChild->publicId; # -//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN
# set
print $tree->document->firstChild->publicId('-//W3C//DTD SVG 1.1//EN');
print $tree->document->firstChild->html; #
=head3 systemId
my $system_id = $node->systemId;
my $node = $node->systemId($new_system_id);
Return or change system id from doctype.
my $tree = HTML5::DOM->new->parse('
');
# get
print $tree->document->firstChild->systemId; # http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd
# set
print $tree->document->firstChild->systemId('http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd');
print $tree->document->firstChild->html; #
=head1 HTML5::DOM::Collection
Collection of DOM nodes.
=head3 new
my $collection = HTML5::DOM::Collection->new($nodes);
Creates new collection from C<$nodes> (reference to array with L<HTML5::DOM::Node|/HTML5::DOM::Node>).
=head3 each
$collection->each(sub {...});
$collection->each(sub {...}, @additional_args);
Foreach all nodes in collection. Returns self.
Example:
$collection->each(sub {
my ($node, $index) = @_;
print "FOUND: node[$index] is a '$node'\n";
});
# Also can bypass additional arguments
$collection->each(sub {
my ($node, $index, $title) = @_;
print $title."node[$index] is a '$node'\n";
}, "FOUND: ");
=head3 map
my $new_collection = $collection->map(sub {
my ($node, $index) = @_;
return "FOUND: ".$node->tag." => $index";
});
# Additional arguments can also be passed
my $new_collection = $collection->map(sub {
my ($node, $index, $title) = @_;
return $title.$node->tag." => $index";
}, "FOUND: ");
Apply callback for each node in collection. Returns new array from results.
my $new_collection = $collection->map($method, @args);
Call method for each node in collection. Returns new L from results.
Example:
# set text 'test!' for all nodes
$collection->map('text', 'test!');
# get all tag names as array
my $new_collection = $collection->map('tag');
# remove all nodes in collection
$collection->map('remove');
=head3 add
my $collection = $collection->add($node);
Add new item to collection.
=head3 length
my $length = $collection->length;
Items count in collection.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print $collection->length; # 3
=head3 grep
my $new_collection = $collection->grep(qr/regexp/);
Evaluates regexp for html code of each element in collection and creates new collection with all matched elements.
my $new_collection = $collection->grep(sub {...});
my $new_collection = $collection->grep(sub {...}, @args);
Evaluates callback foreach element in collection and creates new collection with all elements for which callback returned true.
Example for regexp:
my $tree = HTML5::DOM->new->parse('
Linux
OSX (not supported)
Windows (not supported)
');
my $collection = $tree->find('ul li')->grep(qr/not supported/);
print $collection->length; # 2
Example for callback:
my $tree = HTML5::DOM->new->parse('
Linux
OSX (not supported)
Windows (not supported)
');
my $collection = $tree->find('ul li')->grep(sub { $_->html =~ /not supported/ });
print $collection->length; # 2
=head3 first
my $node = $collection->first;
Get first item in collection.
my $node = $collection->first(qr/regexp/);
Get first element in collection which html code matches regexp.
my $node = $collection->first(sub {...});
my $node = $collection->first(sub {...}, @args);
Get first element in collection for which callback returned true.
Example for regexp:
my $tree = HTML5::DOM->new->parse('
Linux
OSX (not supported)
Windows (not supported)
');
my $collection = $tree->find('ul li');
print $collection->first->html; # Linux
print $collection->first(qr/not supported/)->html; # OSX (not supported)
Example for callback:
my $tree = HTML5::DOM->new->parse('
Linux
OSX (not supported)
Windows (not supported)
');
my $collection = $tree->find('ul li');
print $collection->first->html; # Linux
print $collection->first(sub { $_->html =~ /not supported/ })->html; # OSX (not supported)
=head3 last
my $node = $collection->last;
Get last item in collection.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print $collection->last->html; # Windows
=head3 item
my $node = $collection->item($index);
my $node = $collection->[$index];
Get item by C<$index> in collection.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print $collection->item(1)->html; # OSX
print $collection->[1]->html; # OSX
=head3 reverse
my $reversed_collection = $collection->reverse;
Returns copy of collection in reverse order.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print join(', ', @{$collection->map('text')}); # Linux, OSX, Windows
print join(', ', @{$collection->reverse()->map('text')}); # Windows, OSX, Linux
=head3 shuffle
my $shuffled_collection = $collection->shuffle;
Returns copy of collection in random order.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print join(', ', @{$collection->shuffle()->map('text')}); # Windows, Linux, OSX
print join(', ', @{$collection->shuffle()->map('text')}); # Windows, OSX, Linux
print join(', ', @{$collection->shuffle()->map('text')}); # OSX, Windows, Linux
=head3 head
my $new_collection = $collection->head($length);
Returns copy of collection with only first C<$length> items.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print join(', ', @{$collection->head(2)->map('text')}); # Linux, OSX
=head3 tail
my $new_collection = $collection->tail($length);
Returns copy of collection with only last C<$length> items.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print join(', ', @{$collection->tail(2)->map('text')}); # OSX, Windows
=head3 slice
my $new_collection = $collection->slice($offset);
Returns new collection with sequence by specified C<$offset>.
If C<$offset> is positive, the sequence will start at that C<$offset> in the C<$collection>.
If C<$offset> is negative, the sequence will start that far from the end of the C<$collection>.
my $new_collection = $collection->slice($offset, $length);
Returns new collection with sequence by specified C<$offset> and C<$length>.
If C<$offset> is positive, the sequence will start at that C<$offset> in the C<$collection>.
If C<$offset> is negative, the sequence will start that far from the end of the C<$collection>.
If C<$length> is positive, then the sequence will have up to that many elements in it.
If the C<$collection> is shorter than the C<$length>, then only the available C<$collection> elements will be present.
If C<$length> is negative then the sequence will stop that many elements from the end of the C<$collection>.
my $tree = HTML5::DOM->new->parse('
');
my $collection = $tree->find('ul li');
print join(', ', @{$collection->slice(1)->map('text')}); # NetBSD, OSX, Windows
print join(', ', @{$collection->slice(1, 2)->map('text')}); # NetBSD, OSX
print join(', ', @{$collection->slice(-2)->map('text')}); # OSX, Windows
print join(', ', @{$collection->slice(-2, 1)->map('text')}); # OSX
print join(', ', @{$collection->slice(-3, -1)->map('text')}); # NetBSD, OSX
=head3 uniq
my $new_collection = $collection->uniq();
Returns copy of collection with only unique nodes.
my $new_collection = $collection->uniq(sub {...});
Returns copy of collection with only unique nodes, where the unique identifier of each node is the value returned by callback.
Example:
my $tree = HTML5::DOM->new->parse('
Ubuntu
Arch Linux
OSX
Windows
');
my $collection = $tree->find('ul li');
print join(', ', @{$collection->uniq->map('text')}); # Ubuntu, Arch Linux, OSX, Windows
print join(', ', @{$collection->uniq(sub { $_->attr("data-kernel") })->map('text')}); # Ubuntu, OSX, Windows
=head3 array
my $node = $collection->array();
Get collection items as array.
=head3 html
my $html = $collection->html;
Concat from all items.
=head3 text
my $text = $collection->text;
Concat from all items.
=head1 HTML5::DOM::TokenList
Similar to L
=head3 has
=head3 contains
my $flag = $tokens->has($token);
my $flag = $tokens->contains($token); # alias
Check if token is contained in current tokens list.
=head3 add
my $tokens = $tokens->add($token);
my $tokens = $tokens->add($token, $token2, ...);
Add new token (or tokens) to current tokens list. Returns self.
=head3 remove
my $tokens = $tokens->remove($token);
my $tokens = $tokens->remove($token, $token2, ...);
Remove one or more tokens from current tokens list. Returns self.
=head3 toggle
my $state = $tokens->toggle($token);
my $state = $tokens->toggle($token, $force_state);
=over
=item *
C<$token> - specified token name
=item *
C<$force_state> - optional force state.
If 1 - similar to L
If 0 - similar to L
=back
Toggle specified token in current tokens list.
=over
=item *
If token exists - remove it
=item *
If token not exists - add it
=back
=head3 length
my $length = $tokens->length;
Returns tokens count in current list.
=head3 item
my $token = $tokens->item($index);
my $token = $tokens->[$index];
Return token by index.
=head3 each
my $token = $tokens->each(sub {
my ($token, $index) = @_;
print "tokens[$index] is a '$token'\n";
});
Foreach all tokens in list.
=head1 HTML5::DOM::AsyncResult
Get result and check status from async parsing.
=head3 parsed
Non-blocking check status.
use warnings;
use strict;
use HTML5::DOM;
my $parser = HTML5::DOM->new;
my $async = $parser->parseAsync('Hello world!
' x 1000);
my $is_parsed;
while (!($is_parsed = $async->parsed)) {
print "is_parsed=$is_parsed\n";
}
Returns 1 if async parsing done. Otherwise returns 0.
=head3 tree
Non-blocking get result.
use warnings;
use strict;
use HTML5::DOM;
my $parser = HTML5::DOM->new;
my $async = $parser->parseAsync('Hello world!
' x 1000);
my $tree;
while (!($tree = $async->tree)) {
print "is_parsed=".($tree ? 1 : 0)."\n";
}
print $tree->at('div')->text."\n"; # Hello world!
Returns L object if async parsing done. Otherwise returns C.
=head3 wait
use warnings;
use strict;
use HTML5::DOM;
my $parser = HTML5::DOM->new;
my $async = $parser->parseAsync('Hello world!
' x 1000);
my $tree = $async->wait;
print $tree->at('div')->text."\n"; # Hello world!
Blocking waits for parsing done and returns L object.
=head1 HTML5::DOM::CSS
CSS Parser object
=head3 new
# with default options
my $css = HTML5::DOM::CSS->new;
# or override some options, if you need
my $css = HTML5::DOM::CSS->new({
utf8 => 0
});
Create new css parser object with options. See L<"CSS PARSER OPTIONS"> for details.
=head3 parseSelector
my $selector = HTML5::DOM::CSS->parseSelector($selector_text);
Parse C<$selector_text> and return L.
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('body div.red, body span.blue');
# with custom options (extends options defined in HTML5::DOM::CSS->new)
my $selector = $css->parseSelector('body div.red, body span.blue', { utf8 => 0 });
=head1 HTML5::DOM::CSS::Selector
CSS Selector object (precompiled selector)
=head3 new
my $selector = HTML5::DOM::CSS::Selector->new($selector_text);
Parse C<$selector_text> and create new css selector object.
If you need to parse many selectors, it is more efficient to use a
single instance of parser L and
L method.
=head3 text
my $selector_text = $selector->text;
Serialize selector to text.
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('body div.red, body span.blue');
print $selector->text."\n"; # body div.red, body span.blue
=head3 ast
my $ast = $selector->ast;
Serialize selector to very simple AST format.
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('div > .red');
print Dumper($selector->ast);
# $VAR1 = [[
# {
# 'value' => 'div',
# 'type' => 'tag'
# },
# {
# 'type' => 'combinator',
# 'value' => 'child'
# },
# {
# 'type' => 'class',
# 'value' => 'red'
# }
# ]];
=head3 length
my $length = $selector->length;
Get selector entries count (selectors separated by "," combinator)
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('body div.red, body span.blue');
print $selector->length."\n"; # 2
=head3 entry
my $entry = $selector->entry($index);
Get selector entry by C<$index> and return L.
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('body div.red, body span.blue');
print $selector->entry(0)->text."\n"; # body div.red
print $selector->entry(1)->text."\n"; # body span.blue
=head3 utf8
As getter - get C<1> if current selector object returns all strings with utf8 flag.
Example with utf8:
use warnings;
use strict;
use HTML5::DOM;
use utf8;
my $selector = HTML5::DOM::CSS->new->parseSelector("[name=\"тест\"]");
my $is_utf8_enabled = $selector->utf8;
print "is_utf8_enabled=".($is_utf8_enabled ? "true" : "false")."\n"; # true
Or example with bytes:
use warnings;
use strict;
use HTML5::DOM;
my $selector = HTML5::DOM::CSS->new->parseSelector("[name=\"тест\"]");
my $is_utf8_enabled = $selector->utf8;
print "is_utf8_enabled=".($is_utf8_enabled ? "true" : "false")."\n"; # false
As setter - enable or disable utf8 flag on all returned strings.
use warnings;
use strict;
use HTML5::DOM;
use utf8;
my $selector = HTML5::DOM::CSS->new->parseSelector("[name=\"тест\"]");
print "is_utf8_enabled=".($selector->utf8 ? "true" : "false")."\n"; # true
print length($selector->text)." chars\n"; # 13 chars
$selector->utf8(0);
print "is_utf8_enabled=".($selector->utf8 ? "true" : "false")."\n"; # false
print length($selector->text)." bytes\n"; # 17 bytes
=head1 HTML5::DOM::CSS::Selector::Entry
CSS selector entry object (precompiled selector)
=head3 text
my $selector_text = $entry->text;
Serialize entry to text.
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('body div.red, body span.blue');
my $entry = $selector->entry(0);
print $entry->text."\n"; # body div.red
=head3 pseudoElement
my $pseudo_name = $entry->pseudoElement;
Return pseudo-element name for entry.
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('div::after');
my $entry = $selector->entry(0);
print $entry->pseudoElement."\n"; # after
=head3 ast
my $ast = $entry->ast;
Serialize entry to very simple AST format.
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('div > .red');
my $entry = $selector->entry(0);
print Dumper($entry->ast);
# $VAR1 = [
# {
# 'value' => 'div',
# 'type' => 'tag'
# },
# {
# 'type' => 'combinator',
# 'value' => 'child'
# },
# {
# 'type' => 'class',
# 'value' => 'red'
# }
# ];
=head3 specificity
my $specificity = $entry->specificity;
Get specificity in hash C<{a, b, c}>
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('body div.red, body span.blue');
my $entry = $selector->entry(0);
print Dumper($entry->specificity); # {a => 0, b => 1, c => 2}
=head3 specificityArray
my $specificity = $entry->specificityArray;
Get specificity in array C<[a, b, c]> (ordered by weight)
my $css = HTML5::DOM::CSS->new;
my $selector = $css->parseSelector('body div.red, body span.blue');
my $entry = $selector->entry(0);
print Dumper($entry->specificityArray); # [0, 1, 2]
=head1 HTML5::DOM::Encoding
Encoding detection.
See for available encodings: L
=head3 id2name
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
Get encoding name by id.
print HTML5::DOM::Encoding::id2name(HTML5::DOM::Encoding->UTF_8); # UTF-8
=head3 name2id
my $encoding_id = HTML5::DOM::Encoding::name2id($encoding);
Get id by name.
print HTML5::DOM::Encoding->UTF_8; # 0
print HTML5::DOM::Encoding::name2id("UTF-8"); # 0
=head3 detectAuto
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectAuto($text, $max_length = 0);
Auto detect text encoding using (in this order):
=over
=item *
L
=item *
L
=item *
L
=back
Returns array with encoding id and new text without BOM, if success.
If fail, then encoding id equal HTML5::DOM::Encoding->NOT_DETERMINED.
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectAuto("ололо");
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # UTF-8
=head3 detect
my $encoding_id = HTML5::DOM::Encoding::detect($text, $max_length = 0);
Detect text encoding. Single method for both L and L.
Returns encoding id, if success. And returns HTML5::DOM::Encoding->NOT_DETERMINED if fail.
my $encoding_id = HTML5::DOM::Encoding::detect("ололо");
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # UTF-8
=head3 detectCyrillic
my $encoding_id = HTML5::DOM::Encoding::detectCyrillic($text, $max_length = 0);
Detect cyrillic text encoding (using lowercase B), such as C, C, C, C, C.
Returns encoding id, if success. And returns HTML5::DOM::Encoding->NOT_DETERMINED if fail.
This method also have aliases for compatibility reasons: C, C
=head3 detectUnicode
my $encoding_id = HTML5::DOM::Encoding::detectUnicode($text, $max_length = 0);
Detect unicode family text encoding, such as C, C, C.
Returns encoding id, if success. And returns HTML5::DOM::Encoding->NOT_DETERMINED if fail.
# get UTF-16LE data for test
my $str = "ололо";
Encode::from_to($str, "UTF-8", "UTF-16LE");
my $encoding_id = HTML5::DOM::Encoding::detectUnicode($str);
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # UTF-16LE
=head3 detectByPrescanStream
my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream($text, $max_length = 0);
Detect encoding by parsing C<E<lt>metaE<gt>> tags in html.
Returns encoding id, if success. And returns HTML5::DOM::Encoding->NOT_DETERMINED if fail.
See for more info: L
my $encoding_id = HTML5::DOM::Encoding::detectByPrescanStream('
');
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # WINDOWS-1251
=head3 detectByCharset
my $encoding_id = HTML5::DOM::Encoding::detectByCharset($text, $max_length = 0);
Extracting character encoding from string. Find "charset=" and see encoding. Return found raw data.
For example: "text/html; charset=windows-1251". Return HTML5::DOM::Encoding->WINDOWS_1251
And returns HTML5::DOM::Encoding->NOT_DETERMINED if fail.
See for more info: L
my $encoding_id = HTML5::DOM::Encoding::detectByCharset('
');
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # WINDOWS-1251
=head3 detectBomAndCut
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut($text, $max_length = 0);
Returns array with encoding id and new text without BOM.
If fail, then encoding id equal HTML5::DOM::Encoding->NOT_DETERMINED.
my ($encoding_id, $new_text) = HTML5::DOM::Encoding::detectBomAndCut("\xEF\xBB\xBFололо");
my $encoding = HTML5::DOM::Encoding::id2name($encoding_id);
print $encoding; # UTF-8
print $new_text; # ололо
=head1 NAMESPACES
=head3 Supported namespace names
html, mathml, svg, xlink, xml, xmlns
=head3 Supported namespace id constants
HTML5::DOM->NS_UNDEF
HTML5::DOM->NS_HTML
HTML5::DOM->NS_MATHML
HTML5::DOM->NS_SVG
HTML5::DOM->NS_XLINK
HTML5::DOM->NS_XML
HTML5::DOM->NS_XMLNS
HTML5::DOM->NS_ANY
HTML5::DOM->NS_LAST_ENTRY
=head1 TAGS
HTML5::DOM->TAG__UNDEF
HTML5::DOM->TAG__TEXT
HTML5::DOM->TAG__COMMENT
HTML5::DOM->TAG__DOCTYPE
HTML5::DOM->TAG_A
HTML5::DOM->TAG_ABBR
HTML5::DOM->TAG_ACRONYM
HTML5::DOM->TAG_ADDRESS
HTML5::DOM->TAG_ANNOTATION_XML
HTML5::DOM->TAG_APPLET
HTML5::DOM->TAG_AREA
HTML5::DOM->TAG_ARTICLE
HTML5::DOM->TAG_ASIDE
HTML5::DOM->TAG_AUDIO
HTML5::DOM->TAG_B
HTML5::DOM->TAG_BASE
HTML5::DOM->TAG_BASEFONT
HTML5::DOM->TAG_BDI
HTML5::DOM->TAG_BDO
HTML5::DOM->TAG_BGSOUND
HTML5::DOM->TAG_BIG
HTML5::DOM->TAG_BLINK
HTML5::DOM->TAG_BLOCKQUOTE
HTML5::DOM->TAG_BODY
HTML5::DOM->TAG_BR
HTML5::DOM->TAG_BUTTON
HTML5::DOM->TAG_CANVAS
HTML5::DOM->TAG_CAPTION
HTML5::DOM->TAG_CENTER
HTML5::DOM->TAG_CITE
HTML5::DOM->TAG_CODE
HTML5::DOM->TAG_COL
HTML5::DOM->TAG_COLGROUP
HTML5::DOM->TAG_COMMAND
HTML5::DOM->TAG_COMMENT
HTML5::DOM->TAG_DATALIST
HTML5::DOM->TAG_DD
HTML5::DOM->TAG_DEL
HTML5::DOM->TAG_DETAILS
HTML5::DOM->TAG_DFN
HTML5::DOM->TAG_DIALOG
HTML5::DOM->TAG_DIR
HTML5::DOM->TAG_DIV
HTML5::DOM->TAG_DL
HTML5::DOM->TAG_DT
HTML5::DOM->TAG_EM
HTML5::DOM->TAG_EMBED
HTML5::DOM->TAG_FIELDSET
HTML5::DOM->TAG_FIGCAPTION
HTML5::DOM->TAG_FIGURE
HTML5::DOM->TAG_FONT
HTML5::DOM->TAG_FOOTER
HTML5::DOM->TAG_FORM
HTML5::DOM->TAG_FRAME
HTML5::DOM->TAG_FRAMESET
HTML5::DOM->TAG_H1
HTML5::DOM->TAG_H2
HTML5::DOM->TAG_H3
HTML5::DOM->TAG_H4
HTML5::DOM->TAG_H5
HTML5::DOM->TAG_H6
HTML5::DOM->TAG_HEAD
HTML5::DOM->TAG_HEADER
HTML5::DOM->TAG_HGROUP
HTML5::DOM->TAG_HR
HTML5::DOM->TAG_HTML
HTML5::DOM->TAG_I
HTML5::DOM->TAG_IFRAME
HTML5::DOM->TAG_IMAGE
HTML5::DOM->TAG_IMG
HTML5::DOM->TAG_INPUT
HTML5::DOM->TAG_INS
HTML5::DOM->TAG_ISINDEX
HTML5::DOM->TAG_KBD
HTML5::DOM->TAG_KEYGEN
HTML5::DOM->TAG_LABEL
HTML5::DOM->TAG_LEGEND
HTML5::DOM->TAG_LI
HTML5::DOM->TAG_LINK
HTML5::DOM->TAG_LISTING
HTML5::DOM->TAG_MAIN
HTML5::DOM->TAG_MAP
HTML5::DOM->TAG_MARK
HTML5::DOM->TAG_MARQUEE
HTML5::DOM->TAG_MENU
HTML5::DOM->TAG_MENUITEM
HTML5::DOM->TAG_META
HTML5::DOM->TAG_METER
HTML5::DOM->TAG_MTEXT
HTML5::DOM->TAG_NAV
HTML5::DOM->TAG_NOBR
HTML5::DOM->TAG_NOEMBED
HTML5::DOM->TAG_NOFRAMES
HTML5::DOM->TAG_NOSCRIPT
HTML5::DOM->TAG_OBJECT
HTML5::DOM->TAG_OL
HTML5::DOM->TAG_OPTGROUP
HTML5::DOM->TAG_OPTION
HTML5::DOM->TAG_OUTPUT
HTML5::DOM->TAG_P
HTML5::DOM->TAG_PARAM
HTML5::DOM->TAG_PLAINTEXT
HTML5::DOM->TAG_PRE
HTML5::DOM->TAG_PROGRESS
HTML5::DOM->TAG_Q
HTML5::DOM->TAG_RB
HTML5::DOM->TAG_RP
HTML5::DOM->TAG_RT
HTML5::DOM->TAG_RTC
HTML5::DOM->TAG_RUBY
HTML5::DOM->TAG_S
HTML5::DOM->TAG_SAMP
HTML5::DOM->TAG_SCRIPT
HTML5::DOM->TAG_SECTION
HTML5::DOM->TAG_SELECT
HTML5::DOM->TAG_SMALL
HTML5::DOM->TAG_SOURCE
HTML5::DOM->TAG_SPAN
HTML5::DOM->TAG_STRIKE
HTML5::DOM->TAG_STRONG
HTML5::DOM->TAG_STYLE
HTML5::DOM->TAG_SUB
HTML5::DOM->TAG_SUMMARY
HTML5::DOM->TAG_SUP
HTML5::DOM->TAG_SVG
HTML5::DOM->TAG_TABLE
HTML5::DOM->TAG_TBODY
HTML5::DOM->TAG_TD
HTML5::DOM->TAG_TEMPLATE
HTML5::DOM->TAG_TEXTAREA
HTML5::DOM->TAG_TFOOT
HTML5::DOM->TAG_TH
HTML5::DOM->TAG_THEAD
HTML5::DOM->TAG_TIME
HTML5::DOM->TAG_TITLE
HTML5::DOM->TAG_TR
HTML5::DOM->TAG_TRACK
HTML5::DOM->TAG_TT
HTML5::DOM->TAG_U
HTML5::DOM->TAG_UL
HTML5::DOM->TAG_VAR
HTML5::DOM->TAG_VIDEO
HTML5::DOM->TAG_WBR
HTML5::DOM->TAG_XMP
HTML5::DOM->TAG_ALTGLYPH
HTML5::DOM->TAG_ALTGLYPHDEF
HTML5::DOM->TAG_ALTGLYPHITEM
HTML5::DOM->TAG_ANIMATE
HTML5::DOM->TAG_ANIMATECOLOR
HTML5::DOM->TAG_ANIMATEMOTION
HTML5::DOM->TAG_ANIMATETRANSFORM
HTML5::DOM->TAG_CIRCLE
HTML5::DOM->TAG_CLIPPATH
HTML5::DOM->TAG_COLOR_PROFILE
HTML5::DOM->TAG_CURSOR
HTML5::DOM->TAG_DEFS
HTML5::DOM->TAG_DESC
HTML5::DOM->TAG_ELLIPSE
HTML5::DOM->TAG_FEBLEND
HTML5::DOM->TAG_FECOLORMATRIX
HTML5::DOM->TAG_FECOMPONENTTRANSFER
HTML5::DOM->TAG_FECOMPOSITE
HTML5::DOM->TAG_FECONVOLVEMATRIX
HTML5::DOM->TAG_FEDIFFUSELIGHTING
HTML5::DOM->TAG_FEDISPLACEMENTMAP
HTML5::DOM->TAG_FEDISTANTLIGHT
HTML5::DOM->TAG_FEDROPSHADOW
HTML5::DOM->TAG_FEFLOOD
HTML5::DOM->TAG_FEFUNCA
HTML5::DOM->TAG_FEFUNCB
HTML5::DOM->TAG_FEFUNCG
HTML5::DOM->TAG_FEFUNCR
HTML5::DOM->TAG_FEGAUSSIANBLUR
HTML5::DOM->TAG_FEIMAGE
HTML5::DOM->TAG_FEMERGE
HTML5::DOM->TAG_FEMERGENODE
HTML5::DOM->TAG_FEMORPHOLOGY
HTML5::DOM->TAG_FEOFFSET
HTML5::DOM->TAG_FEPOINTLIGHT
HTML5::DOM->TAG_FESPECULARLIGHTING
HTML5::DOM->TAG_FESPOTLIGHT
HTML5::DOM->TAG_FETILE
HTML5::DOM->TAG_FETURBULENCE
HTML5::DOM->TAG_FILTER
HTML5::DOM->TAG_FONT_FACE
HTML5::DOM->TAG_FONT_FACE_FORMAT
HTML5::DOM->TAG_FONT_FACE_NAME
HTML5::DOM->TAG_FONT_FACE_SRC
HTML5::DOM->TAG_FONT_FACE_URI
HTML5::DOM->TAG_FOREIGNOBJECT
HTML5::DOM->TAG_G
HTML5::DOM->TAG_GLYPH
HTML5::DOM->TAG_GLYPHREF
HTML5::DOM->TAG_HKERN
HTML5::DOM->TAG_LINE
HTML5::DOM->TAG_LINEARGRADIENT
HTML5::DOM->TAG_MARKER
HTML5::DOM->TAG_MASK
HTML5::DOM->TAG_METADATA
HTML5::DOM->TAG_MISSING_GLYPH
HTML5::DOM->TAG_MPATH
HTML5::DOM->TAG_PATH
HTML5::DOM->TAG_PATTERN
HTML5::DOM->TAG_POLYGON
HTML5::DOM->TAG_POLYLINE
HTML5::DOM->TAG_RADIALGRADIENT
HTML5::DOM->TAG_RECT
HTML5::DOM->TAG_SET
HTML5::DOM->TAG_STOP
HTML5::DOM->TAG_SWITCH
HTML5::DOM->TAG_SYMBOL
HTML5::DOM->TAG_TEXT
HTML5::DOM->TAG_TEXTPATH
HTML5::DOM->TAG_TREF
HTML5::DOM->TAG_TSPAN
HTML5::DOM->TAG_USE
HTML5::DOM->TAG_VIEW
HTML5::DOM->TAG_VKERN
HTML5::DOM->TAG_MATH
HTML5::DOM->TAG_MACTION
HTML5::DOM->TAG_MALIGNGROUP
HTML5::DOM->TAG_MALIGNMARK
HTML5::DOM->TAG_MENCLOSE
HTML5::DOM->TAG_MERROR
HTML5::DOM->TAG_MFENCED
HTML5::DOM->TAG_MFRAC
HTML5::DOM->TAG_MGLYPH
HTML5::DOM->TAG_MI
HTML5::DOM->TAG_MLABELEDTR
HTML5::DOM->TAG_MLONGDIV
HTML5::DOM->TAG_MMULTISCRIPTS
HTML5::DOM->TAG_MN
HTML5::DOM->TAG_MO
HTML5::DOM->TAG_MOVER
HTML5::DOM->TAG_MPADDED
HTML5::DOM->TAG_MPHANTOM
HTML5::DOM->TAG_MROOT
HTML5::DOM->TAG_MROW
HTML5::DOM->TAG_MS
HTML5::DOM->TAG_MSCARRIES
HTML5::DOM->TAG_MSCARRY
HTML5::DOM->TAG_MSGROUP
HTML5::DOM->TAG_MSLINE
HTML5::DOM->TAG_MSPACE
HTML5::DOM->TAG_MSQRT
HTML5::DOM->TAG_MSROW
HTML5::DOM->TAG_MSTACK
HTML5::DOM->TAG_MSTYLE
HTML5::DOM->TAG_MSUB
HTML5::DOM->TAG_MSUP
HTML5::DOM->TAG_MSUBSUP
HTML5::DOM->TAG__END_OF_FILE
HTML5::DOM->TAG_LAST_ENTRY
=head1 ENCODINGS
=head3 Supported encoding names
AUTO, NOT-DETERMINED, X-USER-DEFINED,
BIG5, EUC-JP, EUC-KR, GB18030, GBK, IBM866, MACINTOSH, X-MAC-CYRILLIC, SHIFT_JIS,
ISO-2022-JP, ISO-8859-10, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16, ISO-8859-2,
ISO-8859-3, ISO-8859-4, ISO-8859-5, ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-8-I,
WINDOWS-1250, WINDOWS-1251, WINDOWS-1252, WINDOWS-1253, WINDOWS-1254,
WINDOWS-1255, WINDOWS-1256, WINDOWS-1257, WINDOWS-1258, WINDOWS-874,
UTF-8, UTF-16BE, UTF-16LE, KOI8-R, KOI8-U
=head3 Supported encoding id consts
HTML5::DOM::Encoding->DEFAULT
HTML5::DOM::Encoding->AUTO
HTML5::DOM::Encoding->NOT_DETERMINED
HTML5::DOM::Encoding->UTF_8
HTML5::DOM::Encoding->UTF_16LE
HTML5::DOM::Encoding->UTF_16BE
HTML5::DOM::Encoding->X_USER_DEFINED
HTML5::DOM::Encoding->BIG5
HTML5::DOM::Encoding->EUC_JP
HTML5::DOM::Encoding->EUC_KR
HTML5::DOM::Encoding->GB18030
HTML5::DOM::Encoding->GBK
HTML5::DOM::Encoding->IBM866
HTML5::DOM::Encoding->ISO_2022_JP
HTML5::DOM::Encoding->ISO_8859_10
HTML5::DOM::Encoding->ISO_8859_13
HTML5::DOM::Encoding->ISO_8859_14
HTML5::DOM::Encoding->ISO_8859_15
HTML5::DOM::Encoding->ISO_8859_16
HTML5::DOM::Encoding->ISO_8859_2
HTML5::DOM::Encoding->ISO_8859_3
HTML5::DOM::Encoding->ISO_8859_4
HTML5::DOM::Encoding->ISO_8859_5
HTML5::DOM::Encoding->ISO_8859_6
HTML5::DOM::Encoding->ISO_8859_7
HTML5::DOM::Encoding->ISO_8859_8
HTML5::DOM::Encoding->ISO_8859_8_I
HTML5::DOM::Encoding->KOI8_R
HTML5::DOM::Encoding->KOI8_U
HTML5::DOM::Encoding->MACINTOSH
HTML5::DOM::Encoding->SHIFT_JIS
HTML5::DOM::Encoding->WINDOWS_1250
HTML5::DOM::Encoding->WINDOWS_1251
HTML5::DOM::Encoding->WINDOWS_1252
HTML5::DOM::Encoding->WINDOWS_1253
HTML5::DOM::Encoding->WINDOWS_1254
HTML5::DOM::Encoding->WINDOWS_1255
HTML5::DOM::Encoding->WINDOWS_1256
HTML5::DOM::Encoding->WINDOWS_1257
HTML5::DOM::Encoding->WINDOWS_1258
HTML5::DOM::Encoding->WINDOWS_874
HTML5::DOM::Encoding->X_MAC_CYRILLIC
HTML5::DOM::Encoding->LAST_ENTRY
=head1 PARSER OPTIONS
Options for:
=over
=item *
L
=item *
L
=item *
L
=item *
L
=back
=head4 threads
Threads count, if < 2 - parsing in single mode without threads (default 0)
This option affects only for L.
Originally, L can use multithread parsing.
But in real cases this mode is slower than single mode (threads=0). Resulting speed is very OS-specific and depends on input html.
Not recommended for use if you don't know what you are doing. B
=head4 ignore_whitespace
Ignore whitespace tokens (default 0)
=head4 ignore_doctype
Do not parse DOCTYPE (default 0)
=head4 scripts
If 1 - contents parsed to single text node (default)
If 0 - contents parsed to child nodes
=head4 encoding
Encoding of input HTML, if C - library can try to automatically determine encoding. (default "auto")
Allowed both encoding name or id.
=head4 default_encoding
Default encoding, this affects only if C set to C and encoding not determined. (default "UTF-8")
Allowed both encoding name or id.
See for available encodings: L
=head4 encoding_use_meta
Allow use of C<E<lt>metaE<gt>> tags to determine input HTML encoding. (default 1)
See L.
=head4 encoding_prescan_limit
Limit string length to determine encoding by C<E<lt>metaE<gt>> tags. (default 1024, from spec)
See L.
=head4 encoding_use_bom
Allow use of BOM detection to determine input HTML encoding. (default 1)
See L.
=head4 utf8
Default: C<"auto">
If 1, then all returned strings have utf8 flag (chars).
If 0, then all returned strings haven't utf8 flag (bytes).
If C<"auto">, then utf8 flag detected by input string. Automatically enables C if input string have utf8 flag.
C<"auto"> works only in L, L, L methods.
=head1 CSS PARSER OPTIONS
Options for:
=over
=item *
L
=item *
L
=back
=head4 utf8
Default: C<"auto">
If 1, then all returned strings have utf8 flag (chars).
If 0, then all returned strings haven't utf8 flag (bytes).
If C<"auto">, then utf8 flag detected by input string. Automatically enables C if input string have utf8 flag.
=head1 HTML5 SUPPORT
Tested with L (at 2021-06-26)
-------------------------------------------------------------
test total ok fail skip
-------------------------------------------------------------
foreign-fragment.dat 66 54 12 0
tests26.dat 19 16 3 0
menuitem-element.dat 19 16 3 0
tests11.dat 12 11 1 0
tests1.dat 112 112 0 0
tests4.dat 6 6 0 0
tests6.dat 51 51 0 0
ruby.dat 20 20 0 0
adoption01.dat 17 17 0 0
tests14.dat 6 6 0 0
tests19.dat 104 104 0 0
tests7.dat 30 30 0 0
noscript01.dat 17 17 0 0
tests17.dat 12 12 0 0
tests23.dat 4 4 0 0
pending-spec-changes.dat 2 2 0 0
tables01.dat 16 16 0 0
entities02.dat 25 25 0 0
tests22.dat 4 4 0 0
tests10.dat 53 53 0 0
tests15.dat 13 13 0 0
inbody01.dat 3 3 0 0
template.dat 107 107 0 0
plain-text-unsafe.dat 32 32 0 0
comments01.dat 15 15 0 0
scriptdata01.dat 26 26 0 0
svg.dat 7 7 0 0
tests25.dat 25 25 0 0
tests3.dat 23 23 0 0
tests20.dat 43 43 0 0
tests12.dat 1 1 0 0
tests21.dat 24 24 0 0
math.dat 7 7 0 0
webkit01.dat 49 49 0 0
main-element.dat 2 2 0 0
adoption02.dat 1 1 0 0
domjs-unsafe.dat 48 48 0 0
tests16.dat 196 196 0 0
blocks.dat 47 47 0 0
tests5.dat 16 16 0 0
tests8.dat 9 9 0 0
tricky01.dat 8 8 0 0
tests18.dat 35 35 0 0
webkit02.dat 20 20 0 0
tests24.dat 7 7 0 0
html5test-com.dat 23 23 0 0
isindex.dat 3 3 0 0
doctype01.dat 36 36 0 0
entities01.dat 74 74 0 0
tests2.dat 61 61 0 0
tests9.dat 26 26 0 0
tests_innerHTML_1.dat 84 84 0 0
summary 1666 1647 19 0
Tested with C
perl examples/html5lib_tests.pl --dir=../html5lib-tests/tree-construction --colordiff
Send patches to lexborisov's L if you want to improve this result.
=head1 WORK WITH UTF8
In normal cases you don't need to care about utf8. Everything works out of the box.
By default utf8 mode is enabled automatically if you specify a string with utf8 flag.
For example:
Perfect work with C:
use warnings;
use strict;
use HTML5::DOM;
use utf8;
my $parser = HTML5::DOM->new;
my $str = HTML5::DOM->new->parse('тест тест ')->at('b')->text;
print "length=".length($str)." [$str]\n"; # length=9 [тест тест]
Perfect work without C:
use warnings;
use strict;
use HTML5::DOM;
# Perfect work with default mode of perl strings (bytes)
my $parser = HTML5::DOM->new;
my $str = HTML5::DOM->new->parse('тест тест ')->at('b')->text;
print "length=".length($str)." [$str]\n"; # length=17 [тест тест]
# You can pass string with utf8 flag without "use utf8" and it perfect works
use Encode;
my $test = 'тест тест ';
Encode::_utf8_on($test);
$str = HTML5::DOM->new->parse($test)->at('b')->text;
print "length=".length($str)." [$str]\n"; # length=9 [тест тест]
But you can override this behavior - see L<"PARSER OPTIONS"> for details.
Force use bytes:
use warnings;
use strict;
use HTML5::DOM;
use utf8;
my $parser = HTML5::DOM->new({ utf8 => 0 });
my $str = $parser->parse('тест тест ')->at('b')->text;
print "length=".length($str)." [$str]\n"; # length=17 [тест тест]
Force use utf8:
use warnings;
use strict;
use HTML5::DOM;
my $parser = HTML5::DOM->new({ utf8 => 1 });
my $str = $parser->parse('тест тест ')->at('b')->text;
print "length=".length($str)." [$str]\n"; # length=13 [тест тест]
=head1 BUGS
L
=head1 SEE ALSO
=over
=item *
L - more low-level myhtml bindings.
=item *
L - pure perl HTML5 DOM library with CSS selectors.
=back
=head1 AUTHOR
Kirill Zhumarin
=head1 LICENSE
=over
=item *
HTML5::DOM - L
=item *
Modest - L
=item *
MyHTML - L
=item *
MyCSS - L
=back