#!/usr/bin/perl -w
use strict;
use HTML::TreeBuilder;
use Data::Dumper;
# Public URL of the published site.  Not referenced by any code visible in
# this file -- presumably kept for building absolute links; verify before
# removing.
my $base = 'http://www.geekfarm.org/wu/muse/';
# Local directory holding the generated HTML pages; get_content() reads
# pages from here.
my $dir = '/Users/wu/Muse/Public/html/';
# Page the crawl starts from, relative to $dir.
my $home = "WebHome.html";
# Crawl state shared with process_page():
#   %queue - pages still waiting to be processed (page name => 1)
#   %seen  - link graph built so far (page => { linked page => count })
my ( %queue, %seen );

# Every page that has already been handed to process_page().  %seen cannot
# serve as the "done" marker on its own: it only gets an entry when a page
# yields at least one link, so unreadable pages and pages without local
# links would otherwise be re-read and re-parsed each time another page
# links to them.
my %visited;

$queue{$home} = 1;

# Drain the queue.  process_page() adds newly discovered pages to %queue
# while we iterate, so take a snapshot of the pending keys for each pass and
# loop until a pass leaves nothing behind.
while ( my @pending = keys %queue ) {
    for my $page (@pending) {
        process_page($page) unless $visited{$page}++;

        # Delete after processing so that a page linking to itself does not
        # leave itself in the queue.
        delete $queue{$page};
    }
}
# Print the link graph, one page per line: the page name followed by every
# page it links to.  Each name (including the last) is followed by a single
# space, then the line is terminated with a newline.
#
# Pages and links are sorted so the output is reproducible from run to run;
# Perl's own hash ordering differs between processes.
for my $page ( sort keys %seen ) {
    my @targets = sort keys %{ $seen{$page} };
    print join( q{}, map { "$_ " } $page, @targets ), "\n";
}
# process_page($page)
#
# Reads one page, records every link found on it in the file-level %seen
# graph ($seen{$page}{$target} counts how often $target is linked from
# $page) and puts each target into the file-level %queue for crawling.
#
# Does nothing when get_content() yields no (or empty) content for the page.
sub process_page {
    my ($page) = @_;

    my $html = get_content($page)
        or return;

    my @targets = get_links($html);
    foreach my $target (@targets) {
        $seen{$page}->{$target}++;
        $queue{$target} = 1;
    }

    return;
}
# get_content($page)
#
# Returns the full text of the file "$dir/$page" as a single string.
#
# Failures are deliberately silent: if the file cannot be opened, or the
# handle cannot be closed, the sub returns nothing (undef in scalar
# context).  An empty file yields undef as well, so callers can treat any
# false result as "no content".
sub get_content {
    my ($page) = @_;

    my $file = "$dir/$page";

    open( my $in, "<", $file )
        or return;
    my @lines = <$in>;
    close $in
        or return;

    # Keep the undef result for an empty file rather than returning ''.
    return @lines ? join( '', @lines ) : undef;
}
# get_links($content)
#
# Parses $content as HTML and returns the list of local page links found in
# <a href="..."> attributes, in document order.  Duplicates are kept, so the
# caller can count how often a page is linked.
#
# A link is returned only if, after removing any "#fragment" suffix, it ends
# in "html".  The following are skipped:
#   - anchors without an href (or with an empty/false one)
#   - external links: http://, https://, ftp:// and mailto:
#   - links that climb out of the site directory ("../...")
sub get_links {
    my ($content) = @_;

    my $root = HTML::TreeBuilder->new;
    $root->parse($content);
    $root->eof();

    my @links;
    for my $anchor ( $root->find_by_tag_name('a') ) {

        # Use the documented accessor rather than reaching into the
        # element's internal hash.
        my $href = $anchor->attr('href');
        next unless $href;

        # Ignore external links for now.  URL schemes are case-insensitive.
        next if $href =~ m{^(?:https?|ftp)://}i;
        next if $href =~ m{^mailto:}i;

        # Ignore links pointing above the site directory.
        next if $href =~ m{^\.\./};

        # Drop the fragment BEFORE testing the suffix, so that
        # "Page.html#section" is recognised as a link to "Page.html"
        # instead of being thrown away for not ending in "html".
        $href =~ s{\#.*\z}{}s;
        next unless $href =~ m{html$};

        push @links, $href;
    }

    # Release the parse tree explicitly.  Older HTML::Tree releases need
    # this to free the tree; on newer ones the call is harmless.
    $root->delete;

    return @links;
}