[mca1001] / cronutils / WWWnews-feed.pl  

mca1001: cronutils/WWWnews-feed.pl

File: [mca1001] / cronutils / WWWnews-feed.pl (download) (as text)
Revision: 1.2, Sun Oct 22 16:53:51 2000 UTC (9 years, 10 months ago) by mca1001
Branch: MAIN
CVS Tags: REL_2002_06_20, REL_2002_07_20, REL_2001_05_12, REL_2002_06_19, REL_2002_07_10, REL_2001_04_02, REL_2003_02_03, PUBLIC, HEAD
Changes since 1.1: +190 -129 lines
Huge chunks rewritten, now uses LWP
It's probably still flaky though. Also I notice it doesn't have the
'-w' flag enabled for perl...

#! /usr/bin/perl

# $Id: WWWnews-feed.pl,v 1.2 2000/10/22 16:53:51 mca1001 Exp $
# $Log: WWWnews-feed.pl,v $
# Revision 1.2  2000/10/22 16:53:51  mca1001
# Huge chunks rewritten, now uses LWP
# It's probably still flaky though. Also I notice it doesn't have the
# '-w' flag enabled for perl...
#

# WWWnews-feed          (c) 2000 Matthew Astley <matthew@fruitcake.demon.co.uk>
# Redistributable only under the GNU General Public Licence v2 or above
#
# This script will visit http://www.cam.ac.uk/cgi-bin/WWWnews?grp=ucam.blah , as
# defined in the .rc file below, and download all new articles. It outputs
# a file in "#! rnews" format to stdout, errors to stderr

# As it stands the articles may have ... extra stuff attached to them. I can
# strip this off easily enough but I'm not sure whether there are other
# different sysems out there (eg. dejanews) which will have different
# formatting.
#
# While I'm on the subject of portability, assumptions are currently made about
# the format of an article url in "fetch_group", this should probably be a regexp
# defined in the rc file. The "junk" after a [] line could be used for this...


# rc file is in the following format:
#
# [http://www.cam.ac.uk/cgi-bin/WWWnews?grp=*]
# ucam.adverts.forsale: 1400
# ucam.adverts.wanted:	1100


# The * is replaced with the group from each line that follows, but if you
# prefer you can say something like,
#
# [http://www.cam.ac.uk/cgi-bin/WWWnews?grp=ucam.adverts.forsale]
# localnet.uniforsale: 1400
# 
# [http://www.cam.ac.uk/cgi-bin/WWWnews?grp=ucam.adverts.wanted]
# localnet.uni.wanted:	1100


# Blank lines and comments (beginning with #) are ignored but preserved.
# Bear in mind that if you run two copies of this script on the same file the
# last one to save the rc file "wins" 'cos I can't be bothered to do file
# locking.
#
# After the index file containing the article numbers has been fetched, the
# URLs in that index are assumed to be absolute. If they're not, you'd better
# start hacking.


warn "sort fetch order ascending numeric, save and restart from index on failure";

use strict;
use vars qw($verby);
$verby = 1;

die "\$HOME is not defined" unless $ENV{'HOME'};
my $rcfile = "$ENV{'HOME'}/.wwwnewsrc";
my $outputrc = "";  # new copy of rc file
my $site = "";      # bit between the most recent []s

die "Failed to open '$rcfile'" unless open RC, "$rcfile";

while (<RC>)
{
    chomp;

    if ( /^$/ or /^\#/ ) {
	$outputrc .= "$_\n";
    } elsif ( /^\[(.+)\](.*)$/ ) {
	# we have a site,   [$1]$2
	warn "Ignoring junk after \[], \'$2\' at line $.\n" unless $2 eq "";
	if ( $1 =~ /\s/ ) {
	    warn "Ignoring site with whitespace '$1' at line $.\n";
	} else {
	    $site = $1;
	}
	$outputrc .= "$_\n";
    } elsif ( /^([-+.0-9A-Z_a-z~]+):\s*(\d*)$/ ) {
	# a newsgroup line,   $1: $2
	my ($grp, $lastnum) = ($1, $2);

	# tradition has it (at least, !NewsHound does) that you start
	# from the next number after the one in the file. Can't be
	# bothered with finding and RFC or wotnot.

	my $numstart = ($2 == 0) ? 1 : $2 + 1;
	my $numend = fetch_group($site, $grp, $numstart);
	$numend = $lastnum if $numend == 0;   # nothing fetched

	$outputrc .= "$1: $numend\n";
    } else {
	# non comprendi
	warn "I ignore your silly input line '$_', number $.\n";
	$outputrc .= "$_\n";
    }
}
close RC;

die "Failed to open rc file for writing, please write it for me:\n$outputrc\n"
    unless open RC, ">$rcfile";
print RC $outputrc;
close RC;

exit 0;

############################################################################

sub fetch_group # args = index URL, group, startnum
#| Fetch the index of the group and dump out all the new articles
# Returns last article number fetched
{
    my ($site, $grp, $numstart) = (shift, shift, shift);
    my $lastnum = 0;

    $site =~ s/\*/$grp/;   # doesn't matter if there isn't one

    warn "\nFetching index for $grp\n";
    my $index_html = FetchURL($site);
# my $index_html = `type Homes:mca1001.gdl.wwwnewsbit`;
# my $index_html = "foo href=\"http://blah/art=484\">wibble";  $?=0;

    unless (defined $index_html) {
	warn "Fetch failed for $site\n";
	return 0;
    }

    my %arts;
    while ($index_html =~ /href="(http:[^\s]+art=)(\d+)">/g) {    # an article url
	$arts{$2} = "$1$2" if $2 >= $numstart;
	# stash the URL because if we try to reconstruct it later,
	# we're going to end up hacking horribly when something
	# changes...

    }

    foreach my $art (sort {$a <=> $b} keys %arts) { # in ascending order, so if we bomb out we can continue without missing anything
	if (dump_article ($arts{$art}, $art, $grp )) {
	    $lastnum = $art if $art > $lastnum; # mark as fetched if it succeeds
	} else {
	    last; # abort the fetch if it failed - we haven't marked as read anything we couldn't read
	}
    }

    return $lastnum;
}



sub unwebbify # should probably use HTML::Entities instead...
{
$_ = @_[0];  # ignore other args

  s/\&gt\;/>/g;
  s/\&lt\;/</g;
  s/\&quot\;/\"/g;
  s/\&amp\;/\&/g;    # do this one last!

return $_;
}



sub dump_article # args = (url, article number, groupname)
#| Fetch specified article & dump to STDOUT
# Returns 1 = success, 0 = fail
{
    my ($url, $artnum, $grp) = (shift, shift, shift);
    my $whole_art = "";
    my @art;

    my $site = ($url =~ m@://([^\\/ :]+)@  ? $1 : "unknown.web.site");

    warn "Fetching article $artnum\n";

    {
	my $htmlvsn = FetchURL($url);
	return 0 unless defined $htmlvsn;
	@art = split '\n', $htmlvsn;
    }

    $whole_art  = "Path: webnews-fetcher!$site
Message-ID: <fakeID-$grp-$artnum\@$site>\n";

    my $dumpy = 0;
    while (defined($_ = shift @art) and $dumpy < 2) {
	# hack out headers and text
	if ( $dumpy == 1 ) {
	    if ( /^\<\/pre\>$/ ) {
		$dumpy=2;
	    } else {
		$whole_art .= unwebbify ( "$_\n" );
	    }
	} elsif ( /^\<tr\>\<td align=\"left\">([^\s:]+): \<\/td\>\<td align=\"left\"\>(.*)$/ ) {
	    # got a header line
	    $whole_art .= unwebbify ( "$1: $2\n" );
	} elsif ( /^\<pre\>$/ ) {
	    $whole_art .= "\n";
	    $dumpy = 1;
	}
    }

    $whole_art .= "\n";
    print "#! rnews ", length $whole_art, "\n$whole_art";
    return 1;
}


sub FetchURL # args = (URL)
# Returns document, or undef for failure
# Sends error details to STDERR
{
    my $url = shift;

    print STDERR "Fetching $url...\n"
	if $verby > 1;

#    my $buff = `lynx -mime_header \Q$url`;
#    my ($err_num, $err_str) = ($?, $!);


    # Create a user agent object
    use LWP::UserAgent;
    my $ua = new LWP::UserAgent;
    $ua->agent("WWWnews-feed/0.1 " . $ua->agent);
    $ua->env_proxy();

    my $req = new HTTP::Request GET => $url;	# Create a request
    my $res = $ua->request($req);		# Pass request to the user agent and get a response back

    # Check the outcome of the response
    unless ($res->is_success) {
	print STDERR "Failed fetching $url: ".($res->status_line())."\n";
	return undef;
    }
    my $buff = $res->content();
    my $headers = $res->headers_as_string();

    # check we didn't provoke the wwwoffled
    if ($headers =~ /wwwoffle/i) {
	print STDERR "$0: wwwoffle intercepted: $url\n";
	return undef;
    }

#    # check we got an HTTP header
#    if (not $headers =~ /^HTTP  (?:\/[\d.]+)?  \s+  (\d+) (\s+.+)? \s*\n/x ){
#	$headers =~ s#[\n\r]#/#;  # kill the linefeeds
#	print STDERR "No HTTP fetch code for $url, headers='$headers'";
#	return undef;
#    }
#    if ($1 != 200) { # the HTTP answer code
#	print STDERR "Received 'HTTP $1 $2$3' for '$url' : headers = >>$headers<<\n";
#	return undef;
#    }

# extract the last-modified: date or whatever here...

    if ($headers =~ /^CONTENT-LENGTH:\s+(\d+)/im) {
	my $exp_len = $1;
	if ($exp_len != length $buff) {
	    print STDERR "Content of $url is ", (length $buff)," bytes but should be $exp_len\n";
#	    my $buff = `lynx -source \Q$url`;
#	    my ($err_num, $err_str) = ($?, $!);
#	    if ($? == 0 and length($buff) == $exp_len) {
#		$headers = "[no headers on second fetch]\n";
#		print STDERR "[but it looks OK on the second pass. Lynx is weird\n";
#	    } else {
		return undef;
#	    }
	}
    }

    if ($headers =~ m@^CONTENT-TYPE:\s+TEXT/HTML@im) {
	# fix the \r\n's
	$buff =~ s/\r\n/\n/g; # turn \r\n into \n  (DOS html files, ick)
	$buff =~ s/\r/\n/g;   # CRs are nasty anyway. Let's kill them!
    }

    print STDERR "Fetched $url\n"
	if $verby > 1;

    return $buff;
}

Repository owner

Powered by ViewCVS 1.0-dev
(Powered by ViewCVS)

ViewCVS and CVS Help