The Java Program: URLtoText.java

  1 import java.io.IOException;
  2 import java.io.Reader;
  3 import java.io.BufferedReader;
  4 import java.io.InputStreamReader;
  5 import java.net.URL;
  6 import java.net.URLConnection;
  7 import java.util.List;
  8 import java.util.Arrays;
  9 import java.util.ArrayList;
 10 
 11 import javax.swing.text.html.parser.ParserDelegator;
 12 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
 13 import javax.swing.text.html.HTML.Tag;
 14 import javax.swing.text.MutableAttributeSet;
 15 
 16 public final class URLtoText {
 17    private URLtoText() {}
 18 
 19    private final static String[] stopList = {
 20       "a",
 21       "an",
 22       "and",
 23       "he",
 24       "i",
 25       "if",
 26       "is",
 27       "it",
 28       "for",
 29       "from",
 30       "of",
 31       "or",
 32       "she",
 33       "to",
 34       "that",
 35       "the",
 36       "then",
 37       "we",
 38       "with",
 39       "you",
 40    };
 41 
 42    static {
 43       Arrays.sort (stopList);
 44    }
 45 
 46    /*
 47      This boolean function compares two words.
 48    */
 49    private static boolean areTheSame (String s1, String s2) {
 50       return s1.compareTo (s2)==0;
 51    }
 52 
 53    /*
 54      This boolean function compares words in lexicographic order.
 55    */
 56    private static boolean comesBefore (String s1, String s2) {
 57       return s1.compareTo (s2)<0;
 58    }
 59 
 60    /*
 61      Determine if the word "w" in the stop list using linear search.
 62    */
 63    private static boolean inStopList (String w) {
 64       /*
 65         .... 'areTheSame (stopList[i], w)' ...
 66       */
 67       return false;
 68    }
 69 
 70    /*
 71      Determine if the word "w" is in the stop list using binary search.
 72      [To have this function called, rename it to 'inStopList'.]
 73    */
 74    private static boolean inStopList2 (String w) {
 75       // Let 'last' be the index of the last element of the collection.
 76       // ...
 77       // Let 'first' be the index of the first elment of the collection.
 78       // ...
 79 
 80       // Loop while there are still element to be searched.
 81 
 82          // Let 'middle' be the index approx hafway between
 83 
 84          // Compare the element at 'middle' with the value of the search target
 85 
 86             // If equal, return 'true'
 87 
 88             // If less than, change 'first' to be 'middle+1'
 89 
 90             // If greater than, change 'last' to be 'middle-1'
 91 
 92       // The search target has not been found.
 93       return false;
 94    }
 95 
 96    /*
 97     ***********************************************
 98     */
 99 
100    /*
101      Print all the words in a given URL if they don't appear in the
102      stop list.
103    */
104    public final static void main (String[] args) throws IOException{
105       final String url = args.length>0?args[0]:"http://www.cs.fit.edu/~pkc/";
106       final URLConnection connection = new URL (url).openConnection();
107       final Reader r = new BufferedReader (new InputStreamReader (connection.getInputStream()));
108       final List<String> words = URLtoText.extractText(r);
109       for (String word : words) {
110          if (!inStopList (word)) System.out.println(word);
111       }
112    }
113 
114 
115    public static List<String> extractText (Reader reader) throws IOException {
116       final ArrayList<String> list = new ArrayList<String>();
117       final ParserDelegator parserDelegator = new ParserDelegator();
118       final ParserCallback parserCallback = new ParserCallback() {
119             public void handleText(final char[] data, final int pos) {
120                for (String w: new String(data).toLowerCase().split("[^a-z]+")) {
121                   if (w.length()>1) list.add(w);
122                }
123             }
124             public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) { }
125             public void handleEndTag(Tag t, final int pos) {  }
126             public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
127             public void handleComment(final char[] data, final int pos) { }
128             public void handleError(final java.lang.String errMsg, final int pos) { }
129          };
130       parserDelegator.parse(reader, parserCallback, true);
131       return list;
132    }
133 }