import java.io.*; import java.util.*; class Cluster { public static void main (String [] args) throws IOException { String s,wrd; Hashtable h = new Hashtable(); ArrayList sourcefilelist = new ArrayList(); FileInputStream stream = new FileInputStream("/mnt/nfs/netapp1/fs3/kvaradar/.public-html/sp2010/hw7input/sourcefiles"); InputStreamReader reader = new InputStreamReader(stream); StreamTokenizer tokens = new StreamTokenizer(reader); while (tokens.nextToken() != tokens.TT_EOF) { s = tokens.sval; sourcefilelist.add(s); } stream.close(); Hashtable [] f = (Hashtable []) new Hashtable[sourcefilelist.size()]; for (int i = 0; i < f.length; i++) f[i] = new Hashtable(); String inputfilename, inputfilenameprefix; inputfilenameprefix = "/mnt/nfs/netapp1/fs3/kvaradar/.public-html/sp2010/hw7input/"; StringBuilder sb; MyWord w; int i = 0; for( String x: sourcefilelist ) { System.out.println("Reading " + x); sb = new StringBuilder(); sb.append(inputfilenameprefix); sb.append(x); inputfilename = new String(sb); stream = new FileInputStream(inputfilename); reader = new InputStreamReader(stream); tokens = new StreamTokenizer(reader); while (tokens.nextToken() != tokens.TT_EOF) { s = tokens.sval; if (isGoodString(s) && (s.length() > 4)) { w = h.get(s); if (w != null) { w.multiplicity = w.multiplicity + 1; if (f[i].get(s) == null) { w.numfiles = w.numfiles + 1; f[i].put(s,s); } h.put(s, w); } else { h.put(s, new MyWord(s,1,1)); f[i].put(s,s); } } } stream.close(); i++; } double current; UndGraph gr = new UndGraph(); for (String x: sourcefilelist) gr.addVertex(x); for (i = 0; i < sourcefilelist.size(); i++) { for (int j = i + 1; j < sourcefilelist.size();j++) { current = inprod(f[i],f[j],h); gr.addEdge(sourcefilelist.get(i), sourcefilelist.get(j), 1 - current); } } /* Once you've added the mst code to the graph class, you can call it here to compute the mst for the document dis-similarity graph. With the edges in the mst, you can compute the four clusters. */ } public static double inprod(Hashtable f1, Hashtable f2, Hashtable h) { double answer = 0; int firstlength = 0; int secondlength = 0; Collection wordlist = f1.values(); for (String x: wordlist) { MyWord w = h.get(x); if ( (w.multiplicity <= 20) && (w.numfiles <=7) ) { firstlength++; if (f2.get(x) != null) answer = answer + 1; } } wordlist = f2.values(); for (String x: wordlist) { MyWord w = h.get(x); if ( (w.multiplicity <= 20) && (w.numfiles <=7) ) { secondlength++; } } return answer/(Math.sqrt(firstlength) * Math.sqrt(secondlength)); } public static boolean isGoodString(String s) { if (s == null) return false; boolean isValid = true; int i = 0; while( (i < s.length()) && isValid ) { isValid = java.lang.Character.isLetter(s.charAt(i)); i++; } return isValid; } } class MyWord { String word; int multiplicity; int numfiles; public MyWord(String s, int x, int n) { word = s; multiplicity = x; numfiles = n; } } class Vertex { String name; LinkedList edgeList ; public Vertex(String s) { name = s; edgeList = new LinkedList(); } } class Edge { Vertex origin; Vertex destination; double cost; // edge has no direction, so origin and destination are misnomers public Edge(Vertex o, Vertex d, double w) { origin = o; destination = d; cost = w; } } class UndGraph { Hashtable h; public UndGraph() { h = new Hashtable(); } public void addVertex(String s) { Vertex v = new Vertex(s); h.put(s,v); } public void addEdge(String s1, String s2, double w) { Vertex u = h.get(s1); Vertex v = h.get(s2); if ( (u != null) && (v !=null) ) { Edge e = new Edge(u,v,w); u.edgeList.add(e); v.edgeList.add(e); } } public void printGraph() { Collection vertexList = h.values(); for(Vertex v: vertexList) { System.out.print(v.name + " :"); for(Edge e: v.edgeList) { if (e.origin == v) System.out.print(" " + e.destination.name); else System.out.print(" " + e.origin.name); } System.out.println(" "); } } /* public LinkedList mst(){} */ }