#!/usr/bin/perl -w
#
# Crawl a tree of local HTML pages, starting from $home inside $dir, and
# follow every relative link that points at another "*.html" page.  When the
# crawl is finished, print one line per page that contained at least one such
# link: the page name followed by the distinct link targets found on it.

use strict;
use warnings;

use HTML::TreeBuilder;
use Data::Dumper;

# Public URL of the site.  Not referenced by the code visible in this file.
my $base = 'http://www.geekfarm.org/wu/muse/';

# Directory holding the published HTML files, and the page the crawl starts at.
my $dir  = '/Users/wu/Muse/Public/html/';
my $home = "WebHome.html";

# %queue:   pages waiting to be processed (page name => 1).
# %seen:    page name => { link target => number of occurrences on that page }.
#           Only pages that yielded at least one link get an entry here.
# %visited: every page already handed to process_page(), whether or not it
#           produced links, so no file is read more than once.
my ( %queue, %seen, %visited );

$queue{$home} = 1;

# Work through the queue in rounds.  Each round takes a snapshot of the
# pending pages, so it is safe for process_page() to add new entries to
# %queue while the round is running.
while ( my @pending = sort keys %queue ) {
    for my $page (@pending) {
        delete $queue{$page};
        next if $visited{$page}++;
        process_page($page);
    }
}

# Report.  Sorted so the output is stable from run to run.  Every name,
# including the last one on a line, is followed by a single space.
for my $page ( sort keys %seen ) {
    print join( ' ', $page, sort keys %{ $seen{$page} } ), " \n";
}

# process_page($page)
#
# Read one page, record every qualifying link it contains in %seen, and put
# each link target on %queue so it gets crawled in turn.  Pages that cannot
# be read, or that are empty, are skipped silently.
sub process_page {
    my ($page) = @_;

    my $content = get_content($page);
    return unless $content;

    for my $link ( get_links($content) ) {
        $seen{$page}->{$link}++;
        $queue{$link} = 1;
    }

    return;
}

# get_content($page)
#
# Return the full text of the file $page inside $dir.  Returns nothing
# (undef in scalar context) when the file cannot be opened or closed; the
# crawl is best-effort, so a missing page is not treated as fatal.
sub get_content {
    my ($page) = @_;

    my $path = "$dir/$page";

    open( my $fh, "<", $path ) or return;

    # Slurp the whole file in one read.
    my $buffer = do { local $/; <$fh> };

    close $fh or return;

    return $buffer;
}

# get_links($content)
#
# Parse $content as HTML and return the list of href values of its <a>
# elements that are worth following: relative links to "*.html" pages.
# Absolute http(s)/ftp URLs, mailto: links and links that start with "../"
# are ignored.  Any "#fragment" is removed from the returned value, so
# "Page.html#section" is returned as "Page.html".
sub get_links {
    my ($content) = @_;

    my $root = HTML::TreeBuilder->new;
    $root->parse($content);
    $root->eof();

    my @links;
    for my $anchor ( $root->find_by_tag_name('a') ) {
        my $href = $anchor->attr('href');
        next unless $href;

        # ignore external links for now
        next if $href =~ m{^https?://};
        next if $href =~ m{^ftp://};
        next if $href =~ m{^mailto:};
        next if $href =~ m{^\.\./};

        # Drop the fragment BEFORE testing the suffix; otherwise a link such
        # as "Page.html#section" would fail the "html$" test and be lost.
        $href =~ s{\#.*$}{};

        next unless $href =~ m{html$};

        push @links, $href;
    }

    # Release the parse tree explicitly once the links have been copied out.
    $root->delete;

    return @links;
}