#!/usr/bin/perl

# WHAT?
# Generate rewrites for a zillion (sub)sites hosted on DTU but
# registered elsewhere.
#
# USAGE
# cp /etc/httpd/conf.d/redirectlist.conf /etc/httpd/conf.d/redirectlist.conf.sav  # hey!
# dtu-redirect-rewrite.pl <dtu-redirects.txt >/etc/httpd/conf.d/redirectlist.conf
# service httpd reload
#
# TRANSFORMATION
# http://nexmap.fysik.dtu.dk/English.aspx\thttp://www.fysik.dtu.dk/english/Research/nexmap
#   ==>
# RewriteRule ^http://nexmap\.fysik\.dtu\.dk/English.aspx$ http://www.fysik.dtu.dk/english/Research/nexmap [R]
#

our $Version = '1.1.0';

use 5.10.0;
use warnings;
use Getopt::Long;
use IO::File;
use File::stat;
use File::Copy;

Getopt::Long::Configure ("bundling");

our $infile;
our $outfile;
our $verbose = 0;
our $update = 0;
our $backup = 0;   # nr. of backups of outfile to keep
GetOptions(
           "infile|i:s"                  => \$infile,
           "outfile|o:s"                 => \$outfile,
           "verbose|v"                   => \$verbose,
           "update|u"                    => \$update,
           "backup|b:i"                  => \$backup
          );

say STDERR "dtu-redirect-rewrite $Version" if $verbose;

if($update and not($infile and $outfile)) {
  dodie("--update must be combined with --infile and --outfile", 2);
}

if($infile and !-r $infile) {
  dodie("infile [$infile] does not exist or is unreadable", 3);
}

my($inh, $outh, $indate, $outdate);

if($update and -r $outfile) {
  $indate = stat($infile)->mtime;
  $outdate = stat($outfile)->mtime;
  if($outdate >= $indate) {
    warn "No action: outfile is newer than infile and --update is in effect" if $verbose;
    exit 1;
  }
}

if($infile) {
  $inh = IO::File->new($infile, 'r');
} else {
  $inh = *STDIN;   # yes, this works
}

if($backup and $outfile and -e $outfile) {
  rotfile($outfile, $backup);
}

if($outfile) {
  $outh = IO::File->new($outfile, 'w');
  dodie("Could not open outfile [$outfile]", 4) unless $outh;
  # TODO report error
} else {
  $outh = *STDOUT;
}

$outh->say("# This file is generated by dtu-redirect-rewrite. Do not edit.");

while(my $l = <$inh>) {
  chomp $l; $l =~ s/\r//;    # might be DOS CRLF
  $l =~ s/^\s+//;
  next if $l =~ /^$/;
  next if $l =~ /^#/;

  my($from, $to) = split('\s+', $l);

  $from =~ s/^http(s)?:\/\///;
  next if $from =~ m|^(www\.)?dtu\.dk|;   # we cannot handle dtu.dk subsites
  $from =~ s/\./\\./g;

  $to = "http://$to" unless $to =~ /^http(s)?:\/\//;

  $outh->say('RewriteCond %{HTTP_HOST}%{REQUEST_URI} ' . $from . '(.*)');
  $outh->say('RewriteRule ^(.*)$ ' . $to . '%1 [L,NE,R=302]');
}

exit 0;

###
##################################################################
###

sub rotfile {
  my($file, $maxgen) = @_;

  return unless -e $file;

  for(my $i = $maxgen - 1; $i >= 1; $i--) {
    my $nxt = $i+1;
    rename("$file.$i", "$file.$nxt") if -e "$file.$i";
  }
  copy($file, "$file.1"); # never delete original!
  return 1;
}


sub dodie {
  my($msg, $rc) = @_;
  warn $msg;
  exit $rc;
}

# Create manpage with
# pod2man -c' ' -n 'DTU-REDIRECT-REWRITE' dtu-redirect-rewrite.pl |nroff -mman

=pod

=head1 NAME

dtu-redirect-rewrite - generate Apache rewrite rules for redirecting DTU websites

=head1 SYNOPSIS

B<dtu-redirect-rewrite> [I<options>] [-i I<file.txt>] [-o I<file.conf>]

e.g.

dtu-redirect-rewrite --backup 5 -i redirectlist.txt -o redirectlist.conf

=head1 DESCRIPTION

B<dtu-redirect-rewrite> takes an input file of the form:

  www.cinf.dtu.dk	www.fysik.dtu.dk/english/Research/cinf

and produces corresponding Apache rewrite rulesets like:

  RewriteCond %{HTTP_HOST}%{REQUEST_URI} www\.cinf\.dtu\.dk(.*)
  RewriteRule ^(.*)$ http://www.fysik.dtu.dk/english/Research/cinf$1 [L,R=302]

Given that the first site's hostname points to redirector.ait.dtu.dk,
Apache catches any URL to that host and redirects it permanently to
the site after the space.

Columns in the input file are space-delimited; multiple spaces or tabs
permitted. Lines starting with a hashmark (#) are ignored. URLs should
not start with a method (HTTP:// or HTTPS://). Hostnames containing
UTF-8 must be properly encoded following IDN rules:
http://en.wikipedia.org/wiki/Internationalized_domain_name

=head1 OPTIONS

=over 4

=item -i, --infile=FILE

Input file containing whitespace-delimited pairs of URL-to-catch,
URL-to-direct-to. If -i is not specified, STDIN is assumed.

=item -o, --outfile=FILE

Output file to contain the Apache directives. These files typically go
into /etc/{apache2|httpd}/conf.d end have a .conf extension.  This
file must be included from a VirtualHost in redirector.conf to be
processed in the proper time.

If not specified, STDOUT is assumed.

=item -b, --backup=MAXGEN

Keep a numbered backup of the outfile, if one exists already; if -b is
specified, the maximum number of backups to keep must be
given. Backups of e.g. list.conf are named list.conf.1, list.conf.2
etc. Ignored if output goes to STDOUT.

=item -u, --update

Only overwrite existing outfile if it's older than infile. Must be
used with -i and -o.

=item -v, --verbose

Says something about what's happening (to STDERR, so this will be
mailed if drr is run from a cronjob).

=back

=head1 EXIT CODES

Dtu-redirect-rewrite exits with 0 if it actually, and sucessfully,
processed input to output. Exit code 1 indicates that nothing was to
be done because the output file was newer than the input file and
--update was in effect. You can use this from a shell script to
determine whether Apache needs to be reloaded.

All higher exit states indicate errors.

=head1 BUGS

The original input file that drr was tested with contained some
erroneous redirects that the script silently ignores. One of those is
where dtu.dk/subsite was used as a source, whilst no dtu.dk URLs ever
reach the redirector.

If the input contains both the sources host/site and
host/site/subsite, whichever URL comes first will be caught.

Your CGI query strings are minced meat.

=head1 AUTHOR

Roel de Cock <roel@dtu.dk>

=cut
