lib/ES/DocsParser.pm (135 lines of code) (raw):
package ES::DocsParser;
use strict;
use warnings;
use parent 'HTML::Parser';
#===================================
sub new {
#===================================
shift()->SUPER::new(
api_version => 3,
ignore_elements => [ 'script', 'style', 'head', 'pre' ],
report_tags => [
"a", "article", "aside", "blockquote",
"br", "caption", "dd", "div",
"dl", "dt", "figcaption", "h1",
"h2", "h3", "h4", "h5",
"h6", "header", "li", "output",
"p", "pre", "section", "textarea",
"th"
],
handlers => {
text => [ \&text, 'self, dtext' ],
start => [ \&start, 'self, tagname, attr' ],
end => [ \&end, 'self, tagname' ],
comment => [ \&comment, 'self, token0' ],
default => ['']
},
empty_element_tags => 1,
);
}
#===================================
sub comment {
#===================================
my ( $self, $comment ) = @_;
if ( $comment eq ' start body ' ) {
$self->new_stack('text');
$self->{sections} = [];
}
elsif ( $comment eq ' end body ' ) {
$self->eof;
}
}
#===================================
sub text {
#===================================
my ( $self, $text ) = @_;
return unless $self->{stack};
my $dest = $self->{stack}[-1][0];
return if $dest eq 'ignore';
return unless $text =~ /\S/;
$text =~ s/\s+/ /g;
$text =~ s/^ //;
$text =~ s/ $//;
$text =~ s/\x{2019}/'/g;
if ( $dest eq 'breadcrumbs' ) {
push @{ $self->{breadcrumbs} }, $text;
}
return unless @{ $self->{sections} };
push @{ $self->{sections}[-1]{$dest} }, $text;
}
#===================================
sub start {
#===================================
my ( $self, $tag, $attr ) = @_;
return unless $self->{stack};
my $current = $self->{stack}[-1];
# ignoring section
if ( $current->[0] eq 'ignore' ) {
push @{ $current->[1] }, $tag;
return;
}
my $class = $attr->{class} || '';
if ( $current->[0] eq 'title' ) {
if ( $tag eq 'a' ) {
$self->{sections}[-1]{id} = $attr->{id} if $attr->{id};
$self->new_stack( 'ignore', $tag ) if $class eq 'edit_me';
}
return;
}
if ( $tag eq 'div' ) {
return $self->new_stack('breadcrumbs')
if $class eq 'breadcrumbs';
return $self->new_stack( 'ignore', $tag )
if $class =~ 'navheader'
|| $class eq 'navfooter'
|| $class eq 'toc';
}
return $self->new_stack( 'ignore', $tag )
if $tag eq 'a' and $class =~ /(console|sense)_widget/;
if ( $tag =~ /^h\d/ ) {
$self->new_stack('title');
$self->new_section;
}
}
#===================================
sub new_stack { push @{ shift()->{stack} }, [ shift(), [@_] ] }
sub new_section { push @{ shift()->{sections} }, { title => [], text => [] } }
#===================================
#===================================
sub end {
#===================================
my ( $self, $tag ) = @_;
return unless $self->{stack};
my $current = $self->{stack}[-1];
if ( $current->[0] eq 'breadcrumbs' ) {
pop @{ $self->{stack} } if $tag eq 'div';
return;
}
if ( $current->[0] eq 'title' ) {
pop @{ $self->{stack} } if $tag =~ /^h\d/;
return;
}
return unless $current->[0] eq 'ignore';
while ( my $old = pop @{ $current->[-1] } ) {
last if $old eq $tag;
}
if ( @{ $current->[-1] } == 0 ) {
pop @{ $self->{stack} };
}
}
#===================================
sub output {
#===================================
my $self = shift;
my $breadcrumbs = join " ", @{ $self->{breadcrumbs} || [] };
my @sections;
for my $section ( @{ $self->{sections} } ) {
my $title = join( " ", @{ $section->{title} } );
my $text = join( " ", @{ $section->{text} } );
if ( $section->{id} && $section->{text} ) {
push @sections,
{
title => $title,
text => $text,
id => "#" . $section->{id}
};
}
else {
$sections[-1]{text} .= "\n\n$title\n\n$text";
}
}
$sections[0]{id} = '';
return { sections => \@sections, breadcrumbs => $breadcrumbs };
}
1;