/* fv.cpp - File entropy visualization program Copyright (C) 2006, Matt Mahoney. This program is distributed without warranty under terms of the GNU general public license. See http://www.gnu.org/licenses/gpl.txt To compile: g++ fv.cpp Usage: fv file (Requires 512 MB memory) The output is fv.bmp with the given size in pixels, which visually displays where matching substrings of various lengths and offsets are found. A pixel at x, y is (black, red, green, blue) if the last matching substring of length (1, 2, 4, 8) at x occurred y bytes ago. x and y are scaled so that the image dimensions match the file length. The y axis is scaled log base 10. The maximum range of a match is 1 GB. */ #define NDEBUG // remove for debugging #include #include #include #include #include #include #include #include using namespace std; // hash table size, needs HSIZE*4 bytes of memory (512 MB). // To reduce memory usage, use a smaller power of 2. This may cause the program to // miss long range matches in very large files, but won't affect smaller files. 
const int HSIZE=0x8000000; typedef unsigned int U32; // 32 bit unsigned int // Image - can be drawn to, and saved as a .bmp file class Image { public: Image(int w, int h): pixels(w*h*3), width(w), height(h) {} // size w by h void saveBMP(const char *filename); // Save as .bmp file // Add colors (-255 to 255) to pixel at x,y (origin at lower left) void put(int x, int y, int red, int green, int blue) { assert(x>=0); assert(x=0); assert(y255?255:c<0?0:c; c=pixels[i+1]+green; pixels[i+1]=c>255?255:c<0?0:c; c=pixels[i+2]+red; pixels[i+2]=c>255?255:c<0?0:c; } private: vector pixels; // width * height * blue-green-red (3 bytes) int width, height; // Image size in pixels void out2(FILE *f, int x) {fprintf(f, "%c%c", x, x>>8);} // Write 2 bytes void out4(FILE *f, unsigned long x) // Write 4 bytes, LSB first {fprintf(f, "%c%c%c%c", int(x), int(x>>8), int(x>>16), int(x>>24));} }; // Save as a .bmp file void Image::saveBMP(const char *filename) { FILE *f=fopen(filename, "wb"); if (!f) { perror(filename); return; } fprintf(f,"BM"); // magic number for .bmp files out4(f, 54+pixels.size()); // file size out4(f, 0); // reserved out4(f, 54); // offset to start of image (no palette) out4(f, 40); // info header size out4(f, width); // image size in pixels out4(f, height); out2(f, 1); // image planes out2(f, 24); // output bits per pixel out4(f, 0); // no compression out4(f, width*height*3); // image size in bytes out4(f, 3000); // x pixels per meter out4(f, 3000); // y pixels per meter out4(f, 0x1000000); // colors out4(f, 0x1000000); // important colors for (int i=0; i0); for (int i=0; i<256; ++i) t[i]=exp(i/c); } // argv = fv filename int main(int argc, char **argv) { try { time_t start_time=clock(); // Read file if (argc<=1) { printf( "fv 1.0 - file statistics visualizer\n" "(C) 2006, Matt Mahoney\n" "This is free software under GPL, www.gnu.org/licenses/gpl.txt\n" "\n" "To use: fv filename\n" "\n" "The output is fv.bmp, a 512 x 256 image which plots the distribution of\n" "string 
matches of length 1, 2, 4, and 8 by location in file (x axis) and\n" "distance backwards to the previous match (y axis, log scale). The length\n" "of the match is color coded: black=1, red=2, green=4, blue=8.\n" "The file must be at least 1 byte and at most 2 GB\n"); return 1; } FILE *f=fopen(argv[1], "rb"); if (!f) { perror(argv[1]); return 1; } fprintf(stderr, "Reading %s\n", argv[1]); fseek(f, 0, SEEK_END); const double size=ftell(f); if (size<1) { fprintf(stderr, "file is either too big or empty\n"); return 1; } // Create blank white image const int width=512; const int height=256; fprintf(stderr, "Drawing fv.bmp %d by %d from %s (%1.0f bytes)\n", width, height, argv[1], size); Image g(width, height); for (int i=0; i=2 && width>=2) for (int i=1; i index(HSIZE); // hash -> checksum (2 high bits), location (30 bits) for (int i=0; i<4; ++i) { const int start_pass=clock(); fseek(f, 0, SEEK_SET); U32 h=0; double xd=y_label_width-xscale; if (i>=2) memset(&index[0], 0, index.size()*sizeof(U32)); for (U32 j=0; j>16)&HSIZE-1]; xd+=xscale; const U32 chk=h&0xc0000000; // 2 bit hash checksum if (p>chk && p1 || rand()