The Java Program: URLtoText2.java

  1 import java.io.IOException;
  2 import java.io.Reader;
  3 import java.io.BufferedReader;
  4 import java.io.InputStreamReader;
  5 import java.net.URL;
  6 import java.net.URLConnection;
  7 import java.util.List;
  8 import java.util.Arrays;
  9 import java.util.ArrayList;
 10 
 11 import javax.swing.text.html.parser.ParserDelegator;
 12 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
 13 import javax.swing.text.html.HTML.Tag;
 14 import javax.swing.text.MutableAttributeSet;
 15 
 16 public final class URLtoText2 {
 17    private URLtoText2() {}
 18 
 19    private final static String[] stopList = {
 20       "a",
 21       "an",
 22       "and",
 23       "he",
 24       "i",
 25       "if",
 26       "is",
 27       "it",
 28       "for",
 29       "from",
 30       "of",
 31       "or",
 32       "she",
 33       "to",
 34       "that",
 35       "the",
 36       "then",
 37       "we",
 38       "with",
 39       "you",
 40    };
 41 
 42    static {
 43       Arrays.sort (stopList);
 44    }
 45 
 46    /*
 47      This boolean function compares two words.
 48    */
 49    private static boolean areTheSame (String s1, String s2) {
 50       return s1.compareTo (s2)==0;
 51    }
 52 
 53    /*
 54      This boolean function compares words in lexicographic order.
 55    */
 56    private static boolean comesBefore (String s1, String s2) {
 57       return s1.compareTo (s2)<0;
 58    }
 59 
 60    /*
 61      Determine if the word "w" in the stop list using linear search.
 62    */
 63    private static boolean inStopList2 (String w) {
 64       /*
 65         .... 'areTheSame (stopList[i], w)' ...
 66       */
 67       return false;
 68    }
 69 
 70    /*
 71      Determine if the word "w" is in the stop list using binary search.
 72      [To have this function called, rename it to 'inStopList'.]
 73    */
 74    private static boolean inStopList (String w) {
 75       // Let 'last' be the index of the last element of the collection.
 76       int last = stopList.length;
 77       // Let 'first' be the index of the first elment of the collection.
 78       int first = 0;
 79 
 80       // Loop while there are still element to be searched.
 81       while (first<last) {
 82 
 83          // Let 'middle' be the index approx hafway between
 84          final int middle = (first+last)/2;
 85 
 86          // Compare the element at 'middle' with the value of the search target
 87          if (areTheSame (stopList[middle], w)) {
 88             // If equal, return 'true'
 89             return true;
 90          } else if (comesBefore (stopList[middle],w)) {
 91             // If less than, change 'first' to be 'middle+1'
 92             assert first<=middle+1;
 93             first = middle+1;
 94             // If greater than, change 'last' to be 'middle-1'
 95          } else {
 96             assert middle-1<=last;
 97             last = middle-1;
 98          }
 99       }
100       // The search target has not been found.
101       return false;
102    }
103 
104    /*
105     ***********************************************
106     */
107 
108    /*
109      Print all the words in a given URL if they don't appear in the
110      stop list.
111    */
112    public final static void main (String[] args) throws IOException{
113       final String url = args.length>0?args[0]:"http://www.cs.fit.edu/~pkc/";
114       final URLConnection connection = new URL (url).openConnection();
115       final Reader r = new BufferedReader (new InputStreamReader (connection.getInputStream()));
116       final List<String> words = URLtoText.extractText(r);
117       for (String word : words) {
118          if (!inStopList (word)) System.out.println(word);
119       }
120    }
121 
122 
123    public static List<String> extractText (Reader reader) throws IOException {
124       final ArrayList<String> list = new ArrayList<String>();
125       final ParserDelegator parserDelegator = new ParserDelegator();
126       final ParserCallback parserCallback = new ParserCallback() {
127             public void handleText(final char[] data, final int pos) {
128                for (String w: new String(data).toLowerCase().split("[^a-z]+")) {
129                   if (w.length()>1) list.add(w);
130                }
131             }
132             public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) { }
133             public void handleEndTag(Tag t, final int pos) {  }
134             public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
135             public void handleComment(final char[] data, final int pos) { }
136             public void handleError(final java.lang.String errMsg, final int pos) { }
137          };
138       parserDelegator.parse(reader, parserCallback, true);
139       return list;
140    }
141 }