/* sad.cpp, Matt Mahoney, mmahoney@cs.fit.edu

Simple anomaly detector, takes full advantage of simulation artifacts
in the 1999 DARPA IDS evaluation data set.

Copyright (C) 2003, Matt Mahoney.  This program is distributed
without warranty under terms of the GNU general public license.
See http://www.gnu.org/licenses/gpl.txt

Usage: sad 38 tcpdump_files... | perl afil.pl > sad.sim
       eval3 sad.sim
       eval4 s=sad.sim
       sad 45 in3tf in45tf | eval -

The first argument is the byte offset in the network packet, including
the 16 byte tcpdump header and 14 byte Ethernet header.  38 is the
TTL field (byte 8 of the IP header).

SAD examines 1 byte of inbound TCP SYN packets.  During training
(hard coded to weeks 1 or 3) it remembers which of the 256 possible
values occurred.  During testing (weeks 2, 4, or 5) it generates an
alarm if the value was never observed in training and no other
anomalies occurred in the last 60 seconds.  The score is t * 1e-6
where t is the time in seconds since the last anomaly.
Some good values on in3tf, in45tf (weeks 3-5 filtered with tf) are:

Byte                    Det/FA out of 177 inside sniffer attacks
----                    ------
33 IP length low byte    15/2
38 TTL                   22/4
42 Src IP addr byte 1    64/41
43 byte 2                67/42
44 byte 3                79/43
45 byte 4                71/16
50 source port hi byte   13/0
62 TCP header size       15/0
64 window size hi byte   15/0
65 window size lo byte    7/0
70 TCP options 1st byte  15/2

Unfiltered tcpdump files should give similar results, since tf doesn't
remove most inbound TCP SYN packets.  However this would increase
run time from 1 sec. to 15 min. so I didn't test them all.
*/


#include <cstdio>
#include <cstdlib>
#include <cctype>
#include <ctime>
#include <cmath>
using namespace std;

// Convert 2 or 4 bytes to int, MSB first
// Decode the 2 bytes at p as a big-endian (most significant byte first)
// unsigned value in the range 0..65535.
int i2(const unsigned char* p) {
  const int hi=p[0];
  const int lo=p[1];
  return hi*256+lo;
}

// Decode the 4 bytes at p as a big-endian (most significant byte first)
// unsigned 32-bit value.  The arithmetic is done in unsigned long so that
// a first byte >= 0x80 is never shifted into the sign bit of an int, which
// would be sign-extended to 0xFFFFFFFFxxxxxxxx where unsigned long is
// 64 bits wide.
unsigned long i4(const unsigned char* p) {
  unsigned long v=0;
  for (int i=0; i<4; ++i)
    v=(v<<8)|p[i];
  return v;
}

// Return the time (seconds since 1970 UT) as "MM/DD/YYYY HH:MM:SS".
// Times before 4/4/99 are shown in EST (UT-5), times from 4/4/99 up to
// 12/31/99 in EDT (UT-4), and later times are left in UT.
//
// The zone offset is applied by hand, so the shifted value is broken down
// with gmtime() rather than localtime(); localtime() would add the host's
// own time zone offset on top of the manual shift.
//
// The result points to a static buffer that is overwritten by the next
// call.  An empty string is returned if the time cannot be converted.
const char* print_time(double seconds) {
  static char s[30];
  time_t t=time_t(seconds);
  if (t<10957*86400) t-=4*3600;  // EDT before 1/1/2000
  if (t<10685*86400) t-=3600;  // EST before 4/4/1999 (one more hour)
  const tm* eastern=gmtime(&t);
  if (eastern)
    strftime(s, sizeof(s), "%m/%d/%Y %H:%M:%S", eastern);
  else
    s[0]=0;
  return s;
}

/* PacketReader - a class for reading tcpdump packets from a file.

PacketReader pr(int argc, char** argv);

  Prepares pr to read packets from a list of files named in argv[0..argc-1].
  Files must be tcpdump files.

const unsigned char* pr.read()

  Reads one packet and returns it, or 0 at end of last file.
  The first call reads the first packet from argv[0].  At the end of
  each file, the file is closed and read() returns the first packet from
  the next file.  The length is i4(pr.read()+12) bytes.
*/

// Reads tcpdump packets sequentially from a list of files.
//
// The object owns a heap buffer and, while a file is being read, an open
// FILE handle.  Both are released by the destructor.  Copying is disabled
// because a copy would share (and later double-release) those resources.
class PacketReader {
private:
  enum {MAX_PACKET=1600};  // Max packet size including tcpdump header
  unsigned char* buf;  // Current input packet, MAX_PACKET bytes
  int argc;  // Number of files remaining to be read
  const char* const* argv;  // Names of files remaining to be read
  FILE* f;  // Currently open file, or 0 if all are closed
  void close(const char* msg=0);  // Close file, print msg if any

  // Not copyable: declared but intentionally never defined
  PacketReader(const PacketReader&);
  PacketReader& operator=(const PacketReader&);
public:
  // ac is the number of file names and av points to the first of them.
  // No file is opened until the first call to read().
  PacketReader(int ac, const char* const* av):
    buf(new unsigned char[MAX_PACKET]), argc(ac), argv(av), f(0) {}

  // Close the current file if the reader is destroyed before reaching the
  // end of the last file, then free the packet buffer.
  ~PacketReader() {
    if (f)
      fclose(f);
    delete[] buf;
  }

  // Return the next packet (starting with its 16 byte tcpdump header),
  // or 0 when no files remain.
  const unsigned char* read();
};

// Finish with the current file: close it if it is open, optionally report
// msg on stderr prefixed by the current file name (argv[0]), and then
// advance argc/argv to the next file name.
void PacketReader::close(const char* msg) {
  FILE* const open_file=f;
  f=0;
  if (open_file!=0)
    fclose(open_file);

  if (msg!=0)
    fprintf(stderr, "%s: %s\n", argv[0], msg);

  // Step past the file name that was just handled
  argv+=1;
  argc-=1;
}

// Read the next packet and return a pointer to it (the 16 byte tcpdump
// record header followed by the captured bytes), or 0 once every file has
// been consumed.  The buffer is reused, so the pointer is only valid until
// the next call.  Any problem with a file (cannot open, bad header,
// truncated record) is reported through close(), after which reading
// continues with the next file.
const unsigned char* PacketReader::read() {
  for (;;) {

    // No file open: try to open the next one and check its 24 byte header
    if (!f) {
      if (argc<1)
        return 0;  // No file to open
      fprintf(stderr, "%s\n", argv[0]);
      f=fopen(argv[0], "rb");
      if (!f)
        close("file not found");
      else if (fread(buf, 1, 24, f)!=24)
        close("file is too small");
      else if (i4(buf)!=0xa1b2c3d4)
        close("not in tcpdump format");
      continue;
    }

    // Per-packet tcpdump header: 16 bytes
    if (fread(buf, 1, 16, f)!=16) {
      close("OK");  // EOF
      continue;
    }

    const unsigned long captured=i4(buf+8);   // Recorded length
    const unsigned long original=i4(buf+12);  // Original length
    if (captured>original || original>MAX_PACKET-16) {
      close("bad tcpdump packet header");
      continue;
    }

    // Packet data follows the header in the same buffer
    if (fread(buf+16, 1, captured, f)!=captured) {
      close("truncated packet");
      continue;
    }
    return buf;
  }
}

// A sad case of anomaly detection.
//
// argv[1] is the byte offset of the attribute to model (counted from the
// start of the 16 byte tcpdump record header); argv[2..] are tcpdump files.
// Packets time-stamped inside the hard coded week 2 or week 4-5 windows are
// test packets; all others are training packets.  Training records which
// values of the attribute byte occur.  Testing prints an alarm line on
// stdout for each never-seen value that arrives at least 60 seconds after
// the previous anomaly.  Progress and statistics go to stderr.
//
// Returns 0 on success, or 1 if the arguments are missing or the byte
// offset is not a non-negative integer.
int main(int argc, const char* const* argv) {

  if (argc<3) {
    fprintf(stderr,
      "SAD v1 (C) 2003, Matt Mahoney, mmahoney@cs.fit.edu\n"
      "Distributed without warranty under terms of the GNU general public\n"
      "license, see http://www.gnu.org/licenses/gpl.txt\n"
      "\n"
      "Usage: sad 38 in[3-5]* | perl afil.pl >sad.sim\n"
      "       eval3 sad.sim\n"
      "       eval4 s=sad.sim\n"
      "\n"
      "SAD (Simple Anomaly Detector, or Simulation Artifact Detector :)\n"
      "demonstrates anomaly detection in the 1999 DARPA IDS evaluation data set.\n"
      "38 (TTL) is the packet byte offset including the 16 byte tcpdump\n"
      "header and 14 byte Ethernet header.  in1* and in3* are training\n"
      "tcpdump files.  in2* in4* in5* are test files (inside.tcpdump).\n"
      "sad, afil.pl, eval3, and eval4 are available at\n"
      "http://cs.fit.edu/~mmahoney/dist/\n");
    return 1;
  }

  // Attribute number.  Parsed strictly: a negative offset would index
  // before the start of the packet buffer, and a non-numeric argument
  // (e.g. a forgotten offset) would otherwise silently become 0.
  char* end=0;
  const long attr=strtol(argv[1], &end, 10);
  if (end==argv[1] || *end!=0 || attr<0) {
    fprintf(stderr, "sad: byte offset must be a non-negative integer: %s\n",
      argv[1]);
    return 1;
  }

  // Anomaly model
  static bool val[256]; // true if seen in training
  unsigned long last_anomaly=0, now=0; // Seconds since 1970

  // These are just for printing statistics when finished
  unsigned long start_time=0;  // Time of first packet
  int trains=0, tests=0, anomalies=0;  // Counts
  int r=0;  // Number of allowed values

  PacketReader pr(argc-2, argv+2);  // Read tcpdump files
  const unsigned char* pkt=0;
  while ((pkt=pr.read())!=0) {
    int len=i4(pkt+8);  // Packet length (truncated) without tcpdump header

    // TCP SYN to 172.16.x.x or 163.118.135.1 (www.cs.fit.edu) ?
    // Offsets include the 16 byte tcpdump and 14 byte Ethernet headers:
    // pkt+28 is the Ethernet type (0x800 = IP), pkt[39] the IP protocol
    // (6 = TCP), pkt[46..49] the destination address.  pkt[63] is taken
    // as the TCP flags byte (2 = SYN only), which presumably assumes an
    // IP header without options.  attr<len+16 keeps the attribute byte
    // inside the captured data.
    if (len>=54 && i2(pkt+28)==0x800 && pkt[39]==6 && pkt[63]==2 &&
        attr<len+16 &&
        ((pkt[46]==172 && pkt[47]==16) ||
         (pkt[46]==163 && pkt[47]==118 && pkt[48]==135 && pkt[49]==1))) {
      now=i4(pkt);  // Time of current packet in seconds since 1970 UT
      if (!start_time)
        start_time=now;
      const bool test2=now>10657*86400 && now<10664*86400;  // Week 2?
      const bool test45=now>10678*86400 && now<10692*86400; // Week 4 or 5?
      const bool train=!test2 && !test45;

      // Train
      if (train) {
        if (++trains==1)  // Print start time of first packet
          fprintf(stderr, "Training starts at %s\n", print_time(now));
        if (!val[pkt[attr]]) {
          val[pkt[attr]]=true;
          last_anomaly=now;
          ++r;
        }
      }

      // Test
      else {
        if (++tests==1) {  // Print training stats at first test packet
          if (now>start_time)
            fprintf(stderr, "Last learned value at %s (%f%% of training)\n",
              print_time(last_anomaly),
              100.0*(last_anomaly-start_time)/(now-start_time));
          fprintf(stderr, "Testing starts at %s\n", print_time(now));
        }
        if (!val[pkt[attr]]) {
          // Compare whole seconds rather than the floating point score:
          // 0.000001*60 rounds to just below the literal 0.00006, which
          // would suppress an alarm exactly 60 seconds after the last one.
          const unsigned long elapsed=now-last_anomaly;
          last_anomaly=now;
          if (elapsed>=60) {  // At least 1 minute since last anomaly?
            const double score=0.000001*elapsed;
            printf("       0 %s %03d.%03d.%03d.%03d %8.6f # %3d\n",
                print_time(now), pkt[46], pkt[47], pkt[48], pkt[49], score,
                pkt[attr]);
            ++anomalies;
          }
        }
      }
    }
  }

  // Print the model and statistics
  fprintf(stderr, "%d allowed values for attribute %ld:", r, attr);
  if (r<256) {
    for (int i=0; i<256; ++i)
      if (val[i])
        fprintf(stderr, " %d", i);
  }
  fprintf(stderr,
    "\n%d anomalies in %d training and %d test packets\nfrom %s",
    anomalies, trains, tests, print_time(start_time));
  fprintf(stderr, " to %s\n", print_time(now));
  return 0;
}
