#!/usr/bin/perl

#-------------------------------------------------------------
# Copyright (C) 2001 by Daniel Käps.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
# 
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# 
# If you do not have a copy of the GNU General Public License write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#-------------------------------------------------------------


#-------------------------------------------------------------

use 5.005;
use strict;
use Getopt::Long;
use File::Find;
use Pod::Text;

#-------------------------------------------------------------
# variables without corresponding command line option:

my($WwwoffleCachePrefix)="/var/cache/wwwoffle";
my($ProgramVersionString)="1.0.0b-02 (2002-10-08)";

#-------------------------------------------------------------

my %Options;

if (! GetOptions(\%Options,
      "url-filename|u",
      "data-filename|d",
      "no-url|n",
      "base-dir-pattern=s",
      "server-base-dirs",
      "non-server-base-dirs",
      "all-base-dirs",
      "add-server-directory-entries|a",
      "help|h",
      "manual|man",
      "version|V"
    ))
{
    system ("pod2text '$0' | sed /^FILES/,\\\$d >&2");
    exit 1;
}

#-------------------------------------------------------------

if ($Options{manual})
  {
    system ("pod2text '$0'");
    exit 1;
  }
if ($Options{help})
  {
    system ("pod2text '$0' | sed /^FILES/,\\\$d");
    exit 1;
  }
if ($Options{version})
  {
    print "$ProgramVersionString\n\n";
    exit 1;
  }

#-------------------------------------------------------------

## BUG add correct parsing for command line arguments: explicit server part spec.,
## find arguments, call back script
## maybe change names: server/non-server dirs --> store/management dirs, resp.

my($URLPattern) = @ARGV;
## print "$URLPattern\n";

my($IsPrintURL)=! $Options{"no-url"};
my($IsPrintURLFileName)=$Options{"url-filename"};
my($IsPrintDataFileName)=$Options{"data-filename"};

# directly defer location of files in the server dirs (ftp|http) from URL files located in 
# lasttime|prevtime[1-3]|outgoing|monitor (e.g. for newly downloaded files, the URL and the
# (HTTP)data files will be identical copies, and the file names are the also same)
my($IsAddServerDirectoryEntries)=$Options{"add-server-directory-entries"};

my($ServerDirectoryBasePattern)="http|ftp|finger";
my($NonServerDirectoryBasePattern)="outgoing|monitor|lasttime|prevtime[1-3]";
my($AllBaseDirectoryPattern)="$ServerDirectoryBasePattern|$NonServerDirectoryBasePattern";

my($WwwoffleBasePattern) = do {
     ($Options{"server-base-dirs"})      ? $ServerDirectoryBasePattern :
     ($Options{"non-server-base-dirs"})  ? $NonServerDirectoryBasePattern :
     ($Options{"all-base-dirs"})         ? $AllBaseDirectoryPattern :
     ($Options{"base-dir-pattern"})      ? $Options{"base-dir-pattern"} :
         $AllBaseDirectoryPattern
      };

#-------------------------------------------------------------

if (! $IsPrintURL)
  {
    print "# BUG not printing URLs not yet implemented, sorry\n";
    exit 1;
  }

#=============================================================

## print "$WwwoffleBasePattern\n";

## $| = 1;                         # so we can see it run
find(\&FindFileStep, "$WwwoffleCachePrefix");

sub FindFileStep
{

    ## -d and print "dir: $_\n";
    if (-d)
     {
       if (
         ## ADDED the following construct:
         $File::Find::name =~ "^$WwwoffleCachePrefix\$")
        {
          return;
        }

       if (
         ## CHANGED following construct:
         $File::Find::name !~ "^$WwwoffleCachePrefix/($WwwoffleBasePattern)\$" &&
         $File::Find::name !~ "^$WwwoffleCachePrefix/($WwwoffleBasePattern)/.*\$" )
        {
          ## print "pruned: $File::Find::name\n";
          $File::Find::prune = 1;
          return;
        }

       # is current dir one of the server directories (those that are located under ftp or http)?
       # (if-condition assumes that they don't have any subdirectories)
       if ($File::Find::name =~ "^$WwwoffleCachePrefix/($ServerDirectoryBasePattern)/.*")
        {
          # get the last component of the directory directly above - this is the protocol
          my($Protocol) = $File::Find::dir;
          $Protocol =~ s,.*/,,g;
          my($URLSpace)= "$Protocol://$_/"; # construct a URL from it

          ## BUG: how is it possible to extract the server part of a URL given as
          ## a regexp??
          my($URLPatternServerPart) = $URLPattern;

          ## print "check: $File::Find::name - $URLSpace - $URLPattern -.\n";

          # check whether the current directory/$URLSpace matches the search pattern
          if ($URLSpace !~ $URLPatternServerPart)
            {
              ## print "pruned: $File::Find::name\n";
              $File::Find::prune = 1;
              return;
            }
        }
       else
        {
          ## ADDED (here we should probably do a return)
          return;
        }

       ## print "dir: $_\n";
     }

    #-------------------------------------------------------------

    # if we don't have a file or if we have a file but it is not a wwwoffle
    # URL file (they look like U... as opposed to e.g. data files which look like D...)
    # we do a return
    -f and /^U.*$/ or return;

    #-------------------------------------------------------------

    open (URLFile, $_) or return;
    my($URL)=<URLFile>;

    if ($URL =~ $URLPattern)
     {
       print "$File::Find::name " if ($IsPrintURLFileName);

       my($DataFileName);
       if ($IsPrintDataFileName)
        {
          $DataFileName=$_;
          $DataFileName =~ s/^U/D/;
          print "$File::Find::dir/$DataFileName ";
        }

       print "$URL\n";
       if ($IsAddServerDirectoryEntries)
        {
          if ($File::Find::name =~ "$WwwoffleCachePrefix/($NonServerDirectoryBasePattern)/.*")
            {
             my($Protocol)=$URL;
             $Protocol =~ s,([a-zA-Z]+)://.*,$1,g;
             my($ServerName)=$URL;
             $ServerName =~ s,[a-zA-Z]+://([^/]+)/.*,$1,g;
             print "$WwwoffleCachePrefix/$Protocol/$ServerName/$_ " if ($IsPrintURLFileName);
             if ($IsPrintDataFileName)
               {
                print "$WwwoffleCachePrefix/$Protocol/$ServerName/$DataFileName ";
               }
             print "$URL\n";
            }
        }
     }
    close URLFile;

}

#=============================================================

__END__

=pod

=head1 NAME

wwwoffle-list - lists URL's and associated filenames residing in a wwwoffle cache

=head1 SYNOPSIS

wwwoffle-list [I<options>] I<URL-PATTERN>

=head1 DESCRIPTION

B<wwwoffle-list> lists URL's along with their associated filenames
which are residing in a wwwoffle cache.  It will descend only to those
directories that match I<URL-PATTERN> to make the search more rapid.

=head1 OPTIONS

=over 4

=item B<-u>, B<--url-filename>

print filename of URL file

=item B<-d>, B<--data-filename>

print filename of data/http file

=item B<-n>, B<--no-url>

don't print URL

=item B<--base-dir-pattern>=I<REGEXP>

search base dirs specified by I<REGEXP>

=item B<--server-base-dirs>

search only the base directories I<http>, I<ftp>, I<finger>

=item B<--non-server-base-dirs>

search only the base directories I<lasttime>, I<prevtime 1-3>, I<outgoing>,
I<monitor>

=item B<--all-base-dirs>

search all base directories

=item B<-a>, B<--add-server-directory-entries>

when searching through the non-server base directories (I<lasttime>,
I<prevtime 1-3>, I<outgoing>, I<monitor>) this will also print
information about their corresponding files in the server directories
(I<http>, I<ftp>, I<finger>)

=item B<-h>, B<--help>

display short help text and exit

=item B<--man>, B<--manual>

display complete help with examples etc. and exit

=item B<-V>, B<--version>

output version information and exit

=back

=head1 FILES

=over 4

=item C</var/cache/wwwoffle> - wwwoffle directory prefix

=back

=for nothing
  =head1 DIAGNOSTICS

=head1 BUGS

=over 4

=item Error handling is incomplete

=item emit an error for an empty URL-PATTERN argument

=item treat files in the outgoing and monitor directory different: they 
have no /D... (data) files but /O... (outgoing headers, form data) files

=item rename ``pattern'' to ``regexp'' where appropriate

=item (for more bugs, see the annotations in the source code)

=back

=head1 SEE ALSO

wwwoffle(1), perldoc wwwoffle-decompress.

=head1 AUTHOR

Daniel Käps (kaeps AT informatik.uni-leipzig.de)

=head1 COPYRIGHT AND DISCLAIMER

This is free software; see the source for copying conditions. 
There is NO warranty; not even for MERCHANTABILITY or FITNESS 
FOR A PARTICULAR PURPOSE.

=cut
