Huge chunks rewritten, now uses LWP It's probably still flaky though. Also I notice it doesn't have the '-w' flag enabled for perl...
#! /usr/bin/perl
# $Id: WWWnews-feed.pl,v 1.2 2000/10/22 16:53:51 mca1001 Exp $
# $Log: WWWnews-feed.pl,v $
# Revision 1.2 2000/10/22 16:53:51 mca1001
# Huge chunks rewritten, now uses LWP
# It's probably still flaky though. Also I notice it doesn't have the
# '-w' flag enabled for perl...
#
# WWWnews-feed (c) 2000 Matthew Astley <matthew@fruitcake.demon.co.uk>
# Redistributable only under the GNU General Public Licence v2 or above
#
# This script will visit http://www.cam.ac.uk/cgi-bin/WWWnews?grp=ucam.blah , as
# defined in the .rc file below, and download all new articles. It outputs
# a file in "#! rnews" format to stdout, errors to stderr
# As it stands the articles may have ... extra stuff attached to them. I can
# strip this off easily enough but I'm not sure whether there are other
# different sysems out there (eg. dejanews) which will have different
# formatting.
#
# While I'm on the subject of portability, assumptions are currently made about
# the format of an article url in "fetch_group", this should probably be a regexp
# defined in the rc file. The "junk" after a [] line could be used for this...
# rc file is in the following format:
#
# [http://www.cam.ac.uk/cgi-bin/WWWnews?grp=*]
# ucam.adverts.forsale: 1400
# ucam.adverts.wanted: 1100
# The * is replaced with the group from each line that follows, but if you
# prefer you can say something like,
#
# [http://www.cam.ac.uk/cgi-bin/WWWnews?grp=ucam.adverts.forsale]
# localnet.uniforsale: 1400
#
# [http://www.cam.ac.uk/cgi-bin/WWWnews?grp=ucam.adverts.wanted]
# localnet.uni.wanted: 1100
# Blank lines and comments (beginning with #) are ignored but preserved.
# Bear in mind that if you run two copies of this script on the same file the
# last one to save the rc file "wins" 'cos I can't be bothered to do file
# locking.
#
# After the index file containing the article numbers has been fetched, the
# URLs in that index are assumed to be absolute. If they're not, you'd better
# start hacking.
warn "sort fetch order ascending numeric, save and restart from index on failure";
use strict;
use vars qw($verby);
$verby = 1;
die "\$HOME is not defined" unless $ENV{'HOME'};
my $rcfile = "$ENV{'HOME'}/.wwwnewsrc";
my $outputrc = ""; # new copy of rc file
my $site = ""; # bit between the most recent []s
die "Failed to open '$rcfile'" unless open RC, "$rcfile";
while (<RC>)
{
chomp;
if ( /^$/ or /^\#/ ) {
$outputrc .= "$_\n";
} elsif ( /^\[(.+)\](.*)$/ ) {
# we have a site, [$1]$2
warn "Ignoring junk after \[], \'$2\' at line $.\n" unless $2 eq "";
if ( $1 =~ /\s/ ) {
warn "Ignoring site with whitespace '$1' at line $.\n";
} else {
$site = $1;
}
$outputrc .= "$_\n";
} elsif ( /^([-+.0-9A-Z_a-z~]+):\s*(\d*)$/ ) {
# a newsgroup line, $1: $2
my ($grp, $lastnum) = ($1, $2);
# tradition has it (at least, !NewsHound does) that you start
# from the next number after the one in the file. Can't be
# bothered with finding and RFC or wotnot.
my $numstart = ($2 == 0) ? 1 : $2 + 1;
my $numend = fetch_group($site, $grp, $numstart);
$numend = $lastnum if $numend == 0; # nothing fetched
$outputrc .= "$1: $numend\n";
} else {
# non comprendi
warn "I ignore your silly input line '$_', number $.\n";
$outputrc .= "$_\n";
}
}
close RC;
die "Failed to open rc file for writing, please write it for me:\n$outputrc\n"
unless open RC, ">$rcfile";
print RC $outputrc;
close RC;
exit 0;
############################################################################
sub fetch_group # args = index URL, group, startnum
#| Fetch the index of the group and dump out all the new articles
# Returns last article number fetched
{
my ($site, $grp, $numstart) = (shift, shift, shift);
my $lastnum = 0;
$site =~ s/\*/$grp/; # doesn't matter if there isn't one
warn "\nFetching index for $grp\n";
my $index_html = FetchURL($site);
# my $index_html = `type Homes:mca1001.gdl.wwwnewsbit`;
# my $index_html = "foo href=\"http://blah/art=484\">wibble"; $?=0;
unless (defined $index_html) {
warn "Fetch failed for $site\n";
return 0;
}
my %arts;
while ($index_html =~ /href="(http:[^\s]+art=)(\d+)">/g) { # an article url
$arts{$2} = "$1$2" if $2 >= $numstart;
# stash the URL because if we try to reconstruct it later,
# we're going to end up hacking horribly when something
# changes...
}
foreach my $art (sort {$a <=> $b} keys %arts) { # in ascending order, so if we bomb out we can continue without missing anything
if (dump_article ($arts{$art}, $art, $grp )) {
$lastnum = $art if $art > $lastnum; # mark as fetched if it succeeds
} else {
last; # abort the fetch if it failed - we haven't marked as read anything we couldn't read
}
}
return $lastnum;
}
sub unwebbify # should probably use HTML::Entities instead...
{
$_ = @_[0]; # ignore other args
s/\>\;/>/g;
s/\<\;/</g;
s/\"\;/\"/g;
s/\&\;/\&/g; # do this one last!
return $_;
}
sub dump_article # args = (url, article number, groupname)
#| Fetch specified article & dump to STDOUT
# Returns 1 = success, 0 = fail
{
my ($url, $artnum, $grp) = (shift, shift, shift);
my $whole_art = "";
my @art;
my $site = ($url =~ m@://([^\\/ :]+)@ ? $1 : "unknown.web.site");
warn "Fetching article $artnum\n";
{
my $htmlvsn = FetchURL($url);
return 0 unless defined $htmlvsn;
@art = split '\n', $htmlvsn;
}
$whole_art = "Path: webnews-fetcher!$site
Message-ID: <fakeID-$grp-$artnum\@$site>\n";
my $dumpy = 0;
while (defined($_ = shift @art) and $dumpy < 2) {
# hack out headers and text
if ( $dumpy == 1 ) {
if ( /^\<\/pre\>$/ ) {
$dumpy=2;
} else {
$whole_art .= unwebbify ( "$_\n" );
}
} elsif ( /^\<tr\>\<td align=\"left\">([^\s:]+): \<\/td\>\<td align=\"left\"\>(.*)$/ ) {
# got a header line
$whole_art .= unwebbify ( "$1: $2\n" );
} elsif ( /^\<pre\>$/ ) {
$whole_art .= "\n";
$dumpy = 1;
}
}
$whole_art .= "\n";
print "#! rnews ", length $whole_art, "\n$whole_art";
return 1;
}
sub FetchURL # args = (URL)
# Returns document, or undef for failure
# Sends error details to STDERR
{
my $url = shift;
print STDERR "Fetching $url...\n"
if $verby > 1;
# my $buff = `lynx -mime_header \Q$url`;
# my ($err_num, $err_str) = ($?, $!);
# Create a user agent object
use LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua->agent("WWWnews-feed/0.1 " . $ua->agent);
$ua->env_proxy();
my $req = new HTTP::Request GET => $url; # Create a request
my $res = $ua->request($req); # Pass request to the user agent and get a response back
# Check the outcome of the response
unless ($res->is_success) {
print STDERR "Failed fetching $url: ".($res->status_line())."\n";
return undef;
}
my $buff = $res->content();
my $headers = $res->headers_as_string();
# check we didn't provoke the wwwoffled
if ($headers =~ /wwwoffle/i) {
print STDERR "$0: wwwoffle intercepted: $url\n";
return undef;
}
# # check we got an HTTP header
# if (not $headers =~ /^HTTP (?:\/[\d.]+)? \s+ (\d+) (\s+.+)? \s*\n/x ){
# $headers =~ s#[\n\r]#/#; # kill the linefeeds
# print STDERR "No HTTP fetch code for $url, headers='$headers'";
# return undef;
# }
# if ($1 != 200) { # the HTTP answer code
# print STDERR "Received 'HTTP $1 $2$3' for '$url' : headers = >>$headers<<\n";
# return undef;
# }
# extract the last-modified: date or whatever here...
if ($headers =~ /^CONTENT-LENGTH:\s+(\d+)/im) {
my $exp_len = $1;
if ($exp_len != length $buff) {
print STDERR "Content of $url is ", (length $buff)," bytes but should be $exp_len\n";
# my $buff = `lynx -source \Q$url`;
# my ($err_num, $err_str) = ($?, $!);
# if ($? == 0 and length($buff) == $exp_len) {
# $headers = "[no headers on second fetch]\n";
# print STDERR "[but it looks OK on the second pass. Lynx is weird\n";
# } else {
return undef;
# }
}
}
if ($headers =~ m@^CONTENT-TYPE:\s+TEXT/HTML@im) {
# fix the \r\n's
$buff =~ s/\r\n/\n/g; # turn \r\n into \n (DOS html files, ick)
$buff =~ s/\r/\n/g; # CRs are nasty anyway. Let's kill them!
}
print STDERR "Fetched $url\n"
if $verby > 1;
return $buff;
}
|
Repository owner Powered by ViewCVS 1.0-dev |
ViewCVS and CVS Help |