Hello tjsnod,
I couldn't find any script that exactly matched this purpose, so here
is a copy of a little script I wrote up to do this purpose.
To see this script in action simply visit here. (Or pick another
page..)
http://fusion.phpwebhosting.com/cgi-bin/lwp-simple.pl?page=www.economist.com
The main two limitations on this script is that it doesn't deal well
with heavily formatted links, and that it doesn't deal with pages with
frames.
You will have to change the perl directory to your own, if necessary,
but I'm sure you know that.
If you have LWP, you SHOULD have HTML::TokeParser. But, if you don't,
you will have to install it.
This script should be placed in your cgi-bin directory, and you should
give executable access to the web server process.
If there's anything you need from me, or something you're confused
about, don't hesitate to ask.
Thanks,
Todd
The script follows:
#!/usr/bin/perl
# By Todd Alverson (Todd@fusion.phpwebhosting.com)
use LWP;
require HTML::TokeParser;
print "Content-type: text/plain\n\n";
print "<HTML>\n<BODY>\n";
# Parse the form input
&parse_form;
# Get the page and parse it
&parse_page;
print "</BODY>\n</HTML>\n";
exit;
sub parse_form {
# Get the input
if ($ENV{'REQUEST_METHOD'} eq "POST") {
read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'});
} elsif ($ENV{'REQUEST_METHOD'} eq "GET") {
$buffer = $ENV{'QUERY_STRING'};
}
# Split the name-value pairs
@pairs = split(/&/, $buffer);
foreach $pair (@pairs) {
($name, $value) = split(/=/, $pair);
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$FORM{$name} = $value;
}
}
sub parse_page {
# Make a new UserAgent
$ua=new LWP::UserAgent;
$ua->agent("Mozilla/4.0 (compatible; MSIE 4.01; Windows 98)");
# httpize it
$pageAddr = "http://" . $FORM{page};
print "Getting $pageAddr<BR> \n";
my $req = new HTTP::Request GET => $pageAddr;
# Request the page
my $res=$ua->request($req);
# Successful request?
if ($res->is_success)
{
# Start a new parser
$p=HTML::TokeParser->new(\$res->content) || die;
# Parse each token on the page
while (my @token = @{$p->get_token}) {
# Find the next <A> tag
if ($token[0] eq "S" && $token[1] eq "a")
{
$link=$token[2]{"href"};
# If it's a relative link, make it absolute
if (substr($link,0,1) eq "/") {
$link = $pageAddr . $link;
}
# Now get the text
$desc = $p->get_trimmed_text('/a');
# Parse it if it's img text...
if ($desc =~ /img/isg) {
if
($desc=~/ALT\s*=\s*([\"\'])(.*?)\1/i
$desc = $2;
} else {
$desc = "";
}
}
# If there's no description, make it say so
if ($desc eq "") {
$desc = "No Alt Text";
}
# Print the link
print "<A HREF=\"$link\">" . $desc .
"</A><BR>\
}
}
} else {
print "Failed to retrieve " . $pageAddr;
}
} |
Clarification of Answer by
coldfusion-ga
on
20 Sep 2002 11:06 PDT
My apologies, two lines were cut off when that was pasted.
For simplicity, I'll just repaste the corrected script.
#!/usr/bin/perl
# By Todd Alverson (Todd@fusion.phpwebhosting.com)
use LWP;
require HTML::TokeParser;
print "Content-type: text/plain\n\n";
print "<HTML>\n<BODY>\n";
# Parse the form input
&parse_form;
# Get the page and parse it
&parse_page;
print "</BODY>\n</HTML>\n";
exit;
sub parse_form {
# Get the input
if ($ENV{'REQUEST_METHOD'} eq "POST") {
read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'});
} elsif ($ENV{'REQUEST_METHOD'} eq "GET") {
$buffer = $ENV{'QUERY_STRING'};
}
# Split the name-value pairs
@pairs = split(/&/, $buffer);
foreach $pair (@pairs) {
($name, $value) = split(/=/, $pair);
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$FORM{$name} = $value;
}
}
sub parse_page {
# Make a new UserAgent
$ua=new LWP::UserAgent;
$ua->agent("Mozilla/4.0 (compatible; MSIE 4.01; Windows 98)");
# httpize it
$pageAddr = "http://" . $FORM{page};
print "Getting $pageAddr<BR> \n";
my $req = new HTTP::Request GET => $pageAddr;
# Request the page
my $res=$ua->request($req);
# Successful request?
if ($res->is_success)
{
# Start a new parser
$p=HTML::TokeParser->new(\$res->content) || die;
# Parse each token on the page
while (my @token = @{$p->get_token}) {
# Find the next <A> tag
if ($token[0] eq "S" && $token[1] eq "a")
{
$link=$token[2]{"href"};
# If it's a relative link, make it absolute
if (substr($link,0,1) eq "/" &&
substr($link,0,2) ne "//") {
$link = $pageAddr . $link;
}
# Now get the text
$desc = $p->get_trimmed_text('/a');
# Parse it if it's img text...
if ($desc =~ /img/isg) {
if
($desc=~/ALT\s*=\s*([\"\'])(.*?)\1/is)
$desc = $2;
} else {
$desc = "";
}
}
# If there's no description, make it say so
if ($desc eq "") {
$desc = "No Alt Text";
}
# Print the link
print "<A HREF=\"$link\">" . $desc .
"</A><BR>\n";
}
}
} else {
print "Failed to retrieve " . $pageAddr;
}
}
|