#!/usr/bin/perl -w

# wpoison-gt.pl - version 0.1
# Based on wpoison.pl version 1.8p
#
# For usage instructions see http://www.monkeys.com/wpoison/
# This version is a modification of the original script, and is
# distributed at http://www.gloomytrousers.co.uk/open_source/
#
# Original idea by Ronald F. Guilmette <rfg@monkeys.com>
# Code implemented by Ronald F. Guilmette <rfg@monkeys.com>
# Assorted improvements by Russell Odom <russ@gloomytrousers.co.uk>
#
# Copyright (c) 2000,2001 Ronald F. Guilmette; All rights reserved.
# Copyright (c) 2018 Russell Odom; no rights reserved!
#
# Redistribution and use in source form ONLY, with or without modification,
# is permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
# 2. All advertising materials mentioning features or use of this software
#     must display the following acknowledgement:
#      This product includes software developed by Ronald F. Guilmette.
# 3. Neither the name of Ronald F. Guilmette nor the names of other con-
#     tributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written permission.
# 4. Either a copy of, or a link to the official Wpoison logo graphic (which
#     may be found at http://www.monkeys.com/wpoison/logo.gif) must be
#     included in, and clearly and legibly visible on the home page of any
#     web site which uses, employs, includes, or makes reference to this
#     software or any derivative or modified version thereof.  Also, the
#     official Wpoison logo itself must be include in an HTML hyperlink
#     so that any usser clicking on any part of the logo image will be
#     directed/linked to the Wpoison home page at:
#
#	http://www.monkeys.com/wpoison/
#
#     In order to satisfy this requirement, you may simply include the
#     following HTML code somewhere (anywhere) on your web site home page:
#
#	<A HREF="http://www.monkeys.com/wpoison/">
#		<IMG SRC="http://www.monkeys.com/wpoison/logo.gif">
#	</A>
#
# Disclaimer
# ----------
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

use strict;
use FileHandle;
use POSIX;
use vars qw($pname $tmp_words_file $num_cached_randwords @randwords @tl_domains_1 @tl_domains_2 @html_tags @headers @cssrules @chunk);

$pname="wpoison";
$tmp_words_file = "/tmp/wpoison.words";
$num_cached_randwords = 4096;
my $creation_time;
my $age_in_seconds;
my $email_addr;
my $num_addresses;
my $num_links;
my $script_name = getenv ("SCRIPT_NAME") || $0;

@tl_domains_1 = (
  "com", "com", "com", "com",
  "net", "net", "net",
  "org", "org",
  "edu", "edu",
  "gov",
  "mil",
  "int");

@tl_domains_2 = (
  "co.uk", "net.uk", "org.uk", "gov.uk", "su",
  "af", "al", "dz", "as", "ad", "ao", "ai", "aq", "ag", "ar", "am", "aw", "au",
  "at", "az", "bs", "bh", "bd", "bb", "by", "be", "bz", "bj", "bm", "bt", "bo",
  "ba", "bw", "bv", "br", "io", "bn", "bg", "bf", "bi", "kh", "cm", "ca", "cv",
  "ky", "cf", "td", "cl", "cn", "cx", "cc", "co", "km", "cg", "ck", "cr", "ci",
  "hr", "cu", "cy", "cz", "dk", "dj", "dm", "do", "tp", "ec", "eg", "sv", "gq",
  "er", "ee", "et", "fk", "fo", "fj", "fi", "fr", "fx", "gf", "pf", "tf", "ga",
  "gm", "ge", "de", "gh", "gi", "gr", "gl", "gd", "gp", "gu", "gt", "gn", "gw",
  "gy", "ht", "hm", "hn", "hk", "hu", "is", "in", "id", "ir", "iq", "ie", "il",
  "it", "jm", "jp", "jo", "kz", "ke", "ki", "kp", "kr", "kw", "kg", "la", "lv",
  "lb", "ls", "lr", "ly", "li", "lt", "lu", "mo", "mk", "mg", "mw", "my", "mv",
  "ml", "mt", "mh", "mq", "mr", "mu", "yt", "mx", "fm", "md", "mc", "mn", "ms",
  "ma", "mz", "mm", "na", "nr", "np", "nl", "an", "nc", "nz", "ni", "ne", "ng",
  "nu", "nf", "mp", "no", "om", "pk", "pw", "pa", "pg", "py", "pe", "ph", "pn",
  "pl", "pt", "pr", "qa", "re", "ro", "ru", "rw", "kn", "lc", "vc", "ws", "sm",
  "st", "sa", "sn", "sc", "sl", "sg", "sk", "si", "sb", "so", "za", "gs", "es",
  "lk", "sh", "pm", "sd", "sr", "sj", "sz", "se", "ch", "sy", "tw", "tj", "tz",
  "th", "tg", "tk", "to", "tt", "tn", "tr", "tm", "tc", "tv", "ug", "ua", "ae",
  "gb", "us", "um", "uy", "uz", "vu", "va", "ve", "vn", "vg", "vi", "wf", "eh",
  "ye", "yu", "zr", "zm", "zw");

my @spamtrap_domains = ("9imail.com", "allfloydians.com",
  "americanleaseline.com", "asiafreeport.com", "firststreetinternet.com",
  "gatekeep.net", "gmailremovethis.com", "hotmhot.com",
  "members.offshorexeecutive.com", "opticlick.net", "prud.net", "quilch.net",
  "ramada-oakville.com", "removethishotmail.com", "telesouthgroup.com");

@html_tags = (
	"<h1>|</h1>\n",
	"<h2>|</h2>\n",
	"<h3>|</h3>\n",
	"<b>|</b>",
	"<i>|</i>",
	"<u>|</u>",
	"<p style=\"color: " . gen_random_color() . "\">|</p>\n",
	"<p style=\"background-color: " . gen_random_color() . "\">|</p>\n",
	"<p>|</p>\n",
	"<p>|</p>\n",
	"<p>|</p>\n",
	"<p>|</p>\n",
	"<p>|</p>\n",
	"<p>|</p>\n",
	"<ul><li>|</li></ul>\n",
);


sub
death
{
	my ($message) = @_;

	print STDOUT<<END;
Content-Type: text/html

<html>
<head><title>wpoison-gt - Unexpected Error</title></head>
<body>
$pname: $message
</body>
</html>
END
  exit 0;
}

sub
gen_new_random_words_list
{
  my $dictfile;
  my $total_words;
  my @words;
  my %already_taken;

  if (-f $tmp_words_file) {
    if (not unlink ($tmp_words_file)) {
      death ("Error unlinking file \`$tmp_words_file\': $!");
    }
  }

  if ( -f "/usr/dict/words") {
    $dictfile = "/usr/dict/words";
  } elsif (-f "/usr/share/dict/words") {
    $dictfile = "/usr/share/dict/words"
  } elsif (-f "words") {
    $dictfile = "words";
  } else {
    death ("Cannot find dictionary file!");
  }

  death ("Error opening dictionary file \`$dictfile\': $!")
    unless (open (DICTFILE, "<$dictfile"));

  $total_words = 0;
  while (<DICTFILE>) {
    chop;
    push @words, $_;
    $total_words++;
  }

  close DICTFILE;

  death ("Error opening tmp words file \`$tmp_words_file\': $!")
    unless (open (RWORDS, ">$tmp_words_file"));

  # We will now pick $num_cached_randwords words at random
  for (1..$num_cached_randwords) {
  try_again:
    my $rand_index = int (rand $total_words);

    goto try_again if (defined $already_taken{$rand_index});
    $already_taken{$rand_index} = 1;
    print RWORDS "$words[$rand_index]\n";
  }

  close RWORDS;
}

sub
read_random_words
{
  death ("Error opening tmp words file \`$tmp_words_file\': $!")
    unless (open (RWORDS, "<$tmp_words_file"));

  chop(@randwords = <RWORDS>);

  close RWORDS;
}

sub
random_word
{
  my $word_index;

  $word_index = int (rand $num_cached_randwords);
  return $randwords[$word_index];
}

sub
gen_random_words
{
  my ($min_words, $max_words) = @_;
  my $num_words;
  my $word_index;
  my $i;
  my @ret = ();

  $num_words = $min_words + (rand ($max_words - $min_words));
  for $i (1..$num_words) {
    $word_index = int (rand $num_cached_randwords);
    push @ret, $randwords[$word_index];
  }
  return @ret;
}

sub
gen_random_color
{
  my $red_code = int (rand 256);
  my $green_code = int (rand 256);
  my $blue_code = int (rand 256);

  return sprintf "#%02x%02x%02x", $red_code, $green_code, $blue_code;
}

sub
random_letter
{
  return chr (unpack ("%c", 'a') + int (rand 26));
}

sub
random_domain
{
  my $rindex;

  if (int (rand 4) == 0) {
    $rindex = int (rand ($#tl_domains_2 + 1));
    return $tl_domains_2[$rindex];
  } else {
    $rindex = int (rand ($#tl_domains_1 + 1));
    return $tl_domains_1[$rindex];
  }
}

# Seed random number generator
srand;

# Build the random words list if we need to (if it doesn't exist, or is too old)
if (not -r $tmp_words_file) {
  gen_new_random_words_list ();
} else {
  $creation_time = (stat $tmp_words_file)[9];
  $age_in_seconds = time - $creation_time;
  gen_new_random_words_list () if ($age_in_seconds > (30 * 60));
}

read_random_words ();

# Generate some HTML headers
# We generate a varying number of headers and output them in a random order.
# This is intended to make it harder for crawler writers to develop a "signature" for recognising wpoison scripts, because the way the response is made up changes on every request.

# Firstly, an essential header, so we don't inconvenience legitimate crawlers (which respect these headers)
@headers = ('<meta name="robots" content="noindex, nofollow" />');

# Next, non-essential headers, which we sometimes add and sometimes don't
push @headers, '<title>' . join(' ', gen_random_words(1, 5)) . '</title>' if (int(rand(10)) != 0);
push @headers, '<meta name="description" content="' . join (' ', gen_random_words(7, 20)) . '" />' if (int(rand(1)) == 0);
push @headers, '<meta name="keywords" content="' . join (', ', gen_random_words(7, 20)) . '" />' if (int(rand(1)) == 0);
# TODO: More could go here

# Some formatting randomness. We're nothing if not thorough :-)
my $bracestyle = (int(rand(1))==0?" ":"\n");
my $indent = substr "\t\t\t", 0, int(rand(3));

# Maybe add some CSS rules
if (int(rand(2)) == 0) {
	# Generate some random real-looking CSS rules
	my @cssentities = ('body', 'a', 'a:visited', 'a:hover', '.'.random_word());
	foreach my $tag (@html_tags) {
		if ($tag =~ /^<([\.:\w\d#]+)/so) {
			push @cssentities, $1;
		}
	}
	my @cssrules = ();
	foreach my $tag (@cssentities) {
		if (int(rand(3)) == 0) {
			# TODO: Randomise the order of these too
			my @thiscss = ();
			push @thiscss, "color: " . gen_random_color() . ";" if (int(rand(2)) == 0);
			push @thiscss,  "background-color: " . gen_random_color() . ";" if (int(rand(2)) == 0);
			push @thiscss,  "size: " . (int(rand(10)) + 6) . "px;" if (int(rand(2)) == 0);
			# TODO: More rules here
			push @cssrules, "$tag${bracestyle}{\n\t" . join("\n\t", @thiscss) . "\n}\n" if (@thiscss);
		}
	}
	# Generate a <style> block with the rules in a random order
	my $cssheader = "<style type=\"text/css\">\n";
	while (@cssrules) {
		$cssheader .= splice(@cssrules, int(rand($#cssrules + 1)), 1) . "\n" if (int(rand(4) <= 2));
	}
	$cssheader .= "</style>";
	push @headers, $cssheader;
}

sub
spamtrap_domain
{ 
	my $rindex;
	$rindex = rand ($#spamtrap_domains + 1);
	return $spamtrap_domains[$rindex];
}

# Output the HTTP and HTML headers
autoflush STDOUT 1;
print STDOUT<<END;
Content-Type: text/html
Cache-control: no-cache

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
END
# Output header tags in a random order
while (@headers) {
	my $thisheader = splice(@headers, int(rand($#headers + 1)), 1) . "\n";
	$thisheader =~ s/^/$indent/gmo;
	print STDOUT $thisheader;
}
print STDOUT<<END;
</head>
<body>
END

# Sleep for between 2 and 6 seconds BEFORE we've output any links that might be followed, to avoid server overload even from crawlers that don't wait for the current page to finish before following the links we give them
sleep (2 + rand(4));

# Generate the chunks to output:
# 1) Some random words
my @outputitems = gen_random_words (50, 200);

# 2) Some e-mail addresses
$num_addresses = 2 + int (rand 16);
for (1..$num_addresses) {
	$email_addr = random_word ();
	$email_addr .= "@";
	if (int (rand 8) == 0) {
		$email_addr .= spamtrap_domain ();
	} else {
		if (int (rand 4) == 0) {
			$email_addr .= random_word () . ".";
		}
		$email_addr .= random_word () . random_letter () . "." . random_domain ();
	}
	push @outputitems, "<a href=\"mailto:$email_addr\">$email_addr</a>";
}

# 3) Some links to more pages
$num_links = 1 + int (rand 16);
for (1..$num_links) {
	push @outputitems, "<a href=\"$script_name/" . join('-', gen_random_words (1, 2)) .'">' . join(' ', gen_random_words (1, 4)) . '</a>';
}

# Output the items in chunks, each with a HTML tag wrapped around, and some random indenting and newlines (this is more "signature" avoidance)
while (@outputitems) {
	my @tag = split /\|/, $html_tags[int(rand $#html_tags+1)];
	my $itemcount = int(rand 30) + 1;
	@chunk = ();
	while ($itemcount-- && @outputitems) {
		push @chunk, splice(@outputitems, int(rand($#outputitems + 1)), 1) . ((int(rand(2)) == 0) ? substr('.!,?-', int(rand(5)), 1) : '');
	}
	print STDOUT substr("\t\t\t", 0, int(rand(3))) . $tag[0];
	print STDOUT join(' ', @chunk);
	print STDOUT $tag[1] . substr("\n\n\n", 0, int(rand(3)));

	# Sleep between chunks sometimes, to further avoid server overload and slow down the crawlers
	# (if they're busy crawling us, they can't be crawling anyone else!)
	sleep (rand(2)) if (int(rand(4)) == 0);
}

# The HTML footer
print STDOUT<<END;
</body>
</html>
END
#system("set > /tmp/tmp");
exit 0;
