/* lerad.cpp - Anomaly rule learning algorithm and detector

Copyright (C) 2002, Matt Mahoney.  This program is distributed
without warranty under terms of the GNU general public license.
See http://www.gnu.org/licenses/gpl.txt

Usage: lerad train.txt test.txt [seed]

This program performs anomaly detection on the 1999 DARPA IDS data set.
It reads input from 2 files, train.txt (attack-free data) and test.txt
(data containing attacks).  The input is free form (words separated
by white space), consisting of: a number of attributes (m), m attribute
names, and tuples containing m attributes.  The first 4 attributes must
be a date (in form mm/dd/yyyy), time (in form hh:mm:ss), and the 2 low
bytes of the IP address of the target host (as 3 decimal digits with
leading 0s).  For instance:

23
Date Time DA1 DA0 DP SA3 SA2 SA1 SA0 SP DUR F1 F2 F3 LEN W1 W2 W3 W4 W5 W6 W7 W8
03/15/1999 08:00:18 113 105 25 196 037 037 158 1024 0 .S .AP .AF 1263 .^@EHLO
03/15/1999 08:00:35 112 050 23 172 016 016 204 1024 45 .S .AP .AF 389 .^@ .^C
03/15/1999 08:00:49 113 084 25 197 182 182 233 1026 0 .S .AP .AF 2165 .^@EHLO

The output is a list of alarms in the format expected by EVAL3 and
EVAL4, with fixed width fields as follows:

       0 MM/DD/YYYY HH:MM:SS 172.016.DA1.DA0 S.SSSSSS #Comments

where S.SSSSSS is a score from 0 to 9.999999.

LERAD learns rules from train.txt.  For instance, given the tuples

A B C D
1 2 3 4
1 2 3 5

It might learn the following rules (output to rules.txt)

1 n=2 if then B=2
2 n=2 if A=1 then B=2
3 n=2 if A=1 C=3 then B=2

where n is the number of tuples satisfying the anteceedent.
Then given the tuple,

A B C D
1 3 4 5

then the anteceedent of the first 2 rules are satisfied, and they
are updated:

1 n=3 if then B=2 3
2 n=3 if A=1 then B=2 3
3 n=2 if A=1 C=3 then B=2 (unchanged)

If a tuple in test.txt satisfies the anteceedent but the consequent
is not in the list, then an alarm is generated with an anomaly score
proportional to tn/r, where t is the time (in number of tuples) since
the last anomaly and r is the number of allowed values in the consequent
(r=2 for rules 1 and 2, r=1 for rule 3).  For instance, if the tuple
below occurs 100 tuples later,

A B C D
1 8 3 0

then since it violates rule 3, it generates an anomlay score of
100 * 2 / 1 = 200.

The optional seed is for random number generation.  If omitted,
it defaults to 0 to produce repeatable results.

*/

#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <ctime>
using namespace std;

const double LOG10=log(10);

int m=0;  // Global number of attributes

typedef int Nominal;  // Index of attribute value
typedef map<Nominal, int> Val;  // Set of allowed values
typedef vector<Nominal> Anteceedent;  // Conditions: 0=don't care, 1=cons.

struct Consequent {  // Consequent and counts of a rule
  int n;  // Support
  int t;  // Time of last anomaly
  Val val;  // Allowed values
  Consequent(): n(0), t(0) {}
  Consequent(Nominal v): n(0), t(0) {val[v]=1;}
  int add(Nominal v) {++n; return ++val[v];}
};

// A ruleset maps anteceedednts to consequents.
// The anteceedent is m elements where 0 = *, 1 = ?, 2 or more is
// the constrained value for the attribute.
typedef map<Anteceedent, Consequent> Ruleset;

struct Rule: public Consequent {  // Complete rule
  Anteceedent rule;  // 0=* 1=? >1 = constraint
  int cons;  // index of ? in rule
  vector<int> ante;  // Indexes of anteceedents >1
  Rule(): cons(0) {}
  Rule(const Anteceedent& v, const Consequent& c): Consequent(c), rule(v) {
    cons=rule.size();
    for (int i=0; i<int(rule.size()); ++i) {
      if (rule[i]==1)
        cons=i;
      else if (rule[i]>1)
        ante.push_back(i);
    }
  }
  bool match(const Anteceedent& a) const {  // Rule applies to a?
    for (int i=0; i<int(ante.size()); ++i)
      if (rule[ante[i]]!=a[ante[i]])
        return false;
    return true;
  }
};

// For sorting rules by max n/r, and by min anteceedents if equal
bool operator < (const Rule& a, const Rule& b) {
  const int nr1=a.n*b.val.size();
  const int nr2=b.n*a.val.size();
  return nr1>nr2 || (nr1==nr2 && a.ante.size()<b.ante.size());
}

// 30-31 bit random number
inline int rnd() {return (rand()+(rand()<<15))&0x7fffffff;}

int main(int argc, char** argv) {

  // Test args
  if (argc<3) {
    cerr << "Usage: " << argv[0] << " train test [seed]\n";
    return 1;
  }

  // Init random numbers
  srand(argc>3 ? atoi(argv[3]) : 0);

  // Translate Nominal to string, string to Nominal
  vector<string> n2s;
  n2s.push_back("*");
  n2s.push_back("?");
  map<string, Nominal> s2n;
  s2n["*"]=0;
  s2n["?"]=1;
  
  // Read relation header
  cerr << "Reading train.txt\n";
  ifstream in(argv[1]);
  if (!in) {
    cerr << argv[1] << " not found\n";
    return 1;
  }
  in >> m;
  if (m<2) {
    cerr << "Only " << m << " attributes\n";
    return 1;
  }
  vector<string> attribute_names(m);
  string s;
  for (int i=0; i<m; ++i) {
    in >> s;
    attribute_names[i]=s;
  }

  // Read relation from train.txt
  vector<Anteceedent> relation;  // [tuple][attribute]
  Anteceedent t(m);
  while (true) {
    int i;
    for (i=0; i<m && in>>s; ++i) {
      Nominal n=s2n[s];
      if (n==0) {
        n=n2s.size();
        n2s.push_back(s);
        s2n[s]=n;
      }
      t[i]=n;
    }
    if (i==m)
      relation.push_back(t);
    else
      break;
  }
  if (relation.size()<2) {
    cerr << "Only " << relation.size() << " tuples\n";
    return 1;
  }
  in.close();

  // Select random samples from the relation for preliminary training
  vector<int> samples;  // Saved ascending random indexes of relation
  int r=min(int(relation.size()), 100);  // Number of samples still needed
  for (int i=0; r>0 && i<int(relation.size()); ++i) {
    int rn=rnd()%(int(relation.size())-i);
    if (rn<r) {
      --r;
      samples.push_back(i);
    }
  }

  // Construct a ruleset.
  // The key is a rule, coded 0="*", 1="?", 2 or more = constrained
  // attribute.  Rules are constructed by sampling 2 tuples
  // in the relation and constructing up to 4 rules for 4 randomly
  // selected matching attributes in random order where the first is "?"
  // and the other 3 are the anteceedents.

  cerr << relation.size() << " tuples.  Constructing ruleset\n";
  Ruleset ruleset;
  Anteceedent rule(m);
  for (int i=0; i<1000; ++i) {  // Number of tuple pairs
/*
    int r1=rnd()%relation.size();  // Pick 2 random tuples
    int r2=rnd()%(relation.size()-1);
    if (r1==r2)
      r2=relation.size()-1;  // so r1 != r2
*/
    // Pick random pair of tuples from the sample set
    int r1=rnd()%samples.size();
    int r2=rnd()%(samples.size()-1);
    if (r1==r2)
      r2=samples.size()-1;
    r1=samples[r1];
    r2=samples[r2];

    // Generate rules by matching attribute values
    for (int j=0; j<m; ++j)
      rule[j]=0;
    int count=0;
    int result=0;
    for (int j=0; j<m && count<4; ++j) {
      int r3=rnd()%m;  // Pick random attribute
      if (relation[r1][r3]==relation[r2][r3]) {  // First match is ?
        if (count==0) {
          result=relation[r1][r3];
          rule[r3]=1;
          count=1;
          ruleset[rule];
        }
        else if (rule[r3]==0) {  // Other matches are anteceedents
          rule[r3]=relation[r1][r3];
          ++count;
          ruleset[rule]=result;
        }
      }
    }
  }

  // Estimate ruleset support by sampling
  cerr << ruleset.size() << " rules.  Estimating support\n";
  vector<Rule> rules;  // Sorted ruleset
  for (Ruleset::iterator p=ruleset.begin(); p!=ruleset.end(); ++p) {
    int k;
    for (k=0; k<m; ++k)  // Find "?"
      if (p->first[k]==1)
        break;
    if (k==m) {
      cerr << "oops, rule with missing ?\n";
      return 1;
    }
    int i;
    for (i=0; i<int(samples.size()); ++i) {
      int r=samples[i];
      int j;
      for (j=0; j<m; ++j)
        if (p->first[j]>=2 && p->first[j]!=relation[r][j])
          break;
      if (j==m) // Rule applies?
        p->second.add(relation[r][k]);
    }
    rules.push_back(Rule(p->first, p->second));
  }

  // Estimate n/r again so that each tuple attribute is predicted by
  // only the rule with the highest n/r from before
  cerr << "Removing duplicate rules from " << rules.size() << " rules\n";
  sort(rules.begin(), rules.end());
  vector<vector<bool> > cover(samples.size());  // cover[i][j] is true
    // if there is a rule predicting attribute relation[sample[i]][j]
  for (int i=0; i<int(cover.size()); ++i)
    cover[i].resize(m);
  for (int i=0; i<int(rules.size()); ++i) {
    Rule& r=rules[i];
    int cons=r.cons;
    r.val.clear();
    r.n=0;
    for (int j=0; j<int(samples.size()); ++j) {
      if (!cover[j][cons] && r.match(relation[samples[j]])) {
        r.add(relation[samples[j]][cons]);
        cover[j][cons]=true;
      }
    }
  }

  // Discard unsupported rules
  for (int i=0; i<int(rules.size()); ++i) {
    if (rules[i].n==0) {
      rules[i]=rules.back();
      rules.pop_back();
      --i;
    }
  }

  // Calculate exact support for top rules on entire training set
  cerr << "Calculating exact coverage for " << rules.size() << " rules\n";
  sort(rules.begin(), rules.end());
  cover.clear();
  cover.resize(relation.size());
  for (int i=0; i<int(cover.size()); ++i)
    cover[i].resize(m);
  int now=0;  // Time
  for (int i=0; i<int(rules.size()); ++i) {
    int cons=rules[i].cons;
    rules[i].n=0;
    rules[i].val.clear();
    now=0;
    for (int j=0; j<int(relation.size()); ++j) {
      ++now;
      if (rules[i].match(relation[j])) {
        cover[j][cons]=true;
        if (rules[i].add(relation[j][cons])==1) {
          rules[i].t=now;
          if (j*10 > int(relation.size()*9)) { // late anomaly, remove rule
            rules[i]=rules.back();
            rules.pop_back();
            --i;
            break;
          }
        }
      }
    }
  }

  // Print coverage
  cerr << "Coverage:";
  for (int i=0; i<m; ++i) {
    int count=0;
    for (int j=0; j<int(cover.size()); ++j)
      if (cover[j][i])
        ++count;
    cerr << attribute_names[i] << "=" << count << " ";
  }
  cerr << "\n";

  // Output rules.txt
  sort(rules.begin(), rules.end());
  ofstream out("rules.txt");
  if (out) {
    for (int i=0; i<int(rules.size()); ++i) {
      out << i+1 << " " << rules[i].n << "/" << rules[i].val.size() << " if";
      for (int j=0; j<m; ++j) {
        if (rules[i].rule[j]>1)
          out << " " << attribute_names[j] << "=" << n2s[rules[i].rule[j]];
      }
      out << " then " << attribute_names[rules[i].cons] << " =";
      for (Val::iterator q=rules[i].val.begin(); q!=rules[i].val.end(); ++q)
        out << " " << n2s[q->first];
      out << "\n";
    }
  }

  // Statistics
  int nsum=0, nrsum=0;
  for (int i=0; i<int(rules.size()); ++i) {
    nsum+=rules[i].n;
    nrsum+=rules[i].n*rules[i].val.size();
    if (i==0 || i==9 || i==99 || i==999 || i==9999 || i==99999
        || i==int(rules.size())-1) {
      cerr << "Rules: " << i+1 << " Coverage: " << nsum << " "
           << (double(nsum)/relation.size()/m)
           << " Diversity: " << double(nrsum)/nsum << "\n";
    }
  }
  cerr << s2n.size() << " strings in training data\n";

  // Score anomalies in test.txt
  cerr << "Testing " << argv[2] << " with " << rules.size() << " rules\n";
  in.open(argv[2]);
  if (!in) {
    cerr << argv[2] << " not found\n";
    return 1;
  }
  int m1=0;
  in >> m1;
  if (m1!=m) {
    cerr << "train.txt width=" << m << " test.txt width=" << m1 << endl;
    return 1;
  }
  for (int i=0; i<m; ++i) {  // Skip first line
    string s;
    in >> s;
  }
  vector<int> ta(rules.size());  // Time of last anomaly per rule
  while (in) {

    // Read tuple t
    ++now;
    int i;
    for (i=0; i<m; ++i) {
      in >> s;
      Nominal n=s2n[s];
      if (n==0) {
        n=n2s.size();
        n2s.push_back(s);
        s2n[s]=n;
      }
      t[i]=n;
    }
    if (!in)
      break;

    // Evaluate tuple t
    double score=0;  // Total score of tuple
    double bs=0;  // Highest scoring rule
    int brule=0;  // Index of highest scoring rule

    for (int i=0; i<int(rules.size()); ++i) {
      double score1=0;  // score for this rule
      const Rule& r=rules[i];
      if (r.match(t) && r.val.find(t[r.cons])==r.val.end() && r.val.size()>0){
        score1=double(now-rules[i].t)*rules[i].n/rules[i].val.size();
        if (score1>bs) {
          brule=i;
          bs=score1;
        }
        score+=score1;
        rules[i].t=now;
      }
    }

    // Print anomaly
    double pct=0;
    if (score>0) {
      pct=100*bs/score;
      score=log(score)/LOG10-4.5;
    }
    if (score>0) {
      cout << "       0 " << n2s[t[0]] << " " << n2s[t[1]] << " 172.016."
           << n2s[t[2]] << "." << n2s[t[3]] << " " << int(score)
           << "." << setw(6) << setfill('0')
           << int(score*1000000)-int(score)*1000000
           << " # " << setw(3) << brule+1 << " ("
           << setprecision(4) << pct << ")";
      for (int i=0; i<m; ++i) {
        if (rules[brule].rule[i]>=1) {
          cout << " " << attribute_names[i];
          if (rules[brule].rule[i]==1)
            cout << "?";
          cout << "=" << n2s[t[i]];
        }
      }
      cout << "\n";
    }
  }
  cerr << s2n.size() << " strings in all data\n";
  return 0;
}

