import java.io.*;
import java.util.*;
class Cluster {
public static void main (String [] args) throws IOException {

    String s,wrd;
    Hashtable<String,MyWord> h = new Hashtable<String,MyWord>();
    ArrayList<String> sourcefilelist = new ArrayList<String>();    
  

    FileInputStream stream = new FileInputStream("/mnt/nfs/netapp1/fs3/kvaradar/.public-html/sp2010/hw7input/sourcefiles");
    InputStreamReader reader = new InputStreamReader(stream);
    StreamTokenizer tokens = new StreamTokenizer(reader);


    while (tokens.nextToken() != tokens.TT_EOF) {
	s = tokens.sval;
        sourcefilelist.add(s);

    }
    stream.close();

    Hashtable<String,String> [] f = (Hashtable<String,String> []) new Hashtable[sourcefilelist.size()];
    for (int i = 0; i < f.length; i++) 
	f[i] = new Hashtable<String,String>();

    String inputfilename, inputfilenameprefix;
    inputfilenameprefix = "/mnt/nfs/netapp1/fs3/kvaradar/.public-html/sp2010/hw7input/";
    StringBuilder sb;
    MyWord w;
    int i = 0;

    for( String x: sourcefilelist ) {
       System.out.println("Reading " + x);
       sb = new StringBuilder();
       sb.append(inputfilenameprefix);
       sb.append(x);
       inputfilename = new String(sb);
     
       stream = new FileInputStream(inputfilename);
       reader = new InputStreamReader(stream);
       tokens = new StreamTokenizer(reader);


       while (tokens.nextToken() != tokens.TT_EOF) {
	      s = tokens.sval;
              if (isGoodString(s) && (s.length() > 4)) {
		  w = h.get(s);
                  if (w != null) {
                      w.multiplicity = w.multiplicity + 1;
                      if (f[i].get(s) == null) {
			  w.numfiles = w.numfiles + 1;
                          f[i].put(s,s);
                      }
                          
                      h.put(s, w);
		  }
                  else {
                      h.put(s, new MyWord(s,1,1));
                      f[i].put(s,s);
                  }
              }
       }
       stream.close();
       i++;
    }

    double current;
    UndGraph gr = new UndGraph();
    for (String x: sourcefilelist) 
	gr.addVertex(x);
  
    for (i = 0; i < sourcefilelist.size(); i++) {
	for (int j = i + 1; j < sourcefilelist.size();j++) {
	    current = inprod(f[i],f[j],h);
            gr.addEdge(sourcefilelist.get(i), sourcefilelist.get(j), 1 - current);

	}
    }
               

    /* Once you've added the mst code to the graph class, you can call it
       here to compute the mst for the document dis-similarity graph. 
       With the edges in the mst, you can compute the four clusters.
    */

}

    public static double inprod(Hashtable<String,String> f1, Hashtable<String,String> f2, Hashtable<String,MyWord> h) {
	double answer = 0;
        int firstlength = 0;
        int secondlength = 0;
        Collection<String> wordlist = f1.values();
        for (String x: wordlist) {
            MyWord w = h.get(x); 
            if ( (w.multiplicity <= 20) && (w.numfiles <=7) ) {
		     firstlength++;
	             if (f2.get(x) != null) answer = answer + 1;
	    }
        }

        wordlist = f2.values();
        for (String x: wordlist) {
            MyWord w = h.get(x); 
            if ( (w.multiplicity <= 20) && (w.numfiles <=7) ) {
		     secondlength++;
	    }
        }


        return answer/(Math.sqrt(firstlength) * Math.sqrt(secondlength));
    }



public static boolean isGoodString(String s) {



    if (s == null) return false;

    boolean isValid = true;
    int i = 0;
    while( (i < s.length()) && isValid ) {
    isValid = java.lang.Character.isLetter(s.charAt(i));
    i++;
    }
    return isValid;
}

}


class MyWord {

    String word;
    int multiplicity;
    int numfiles;

    public MyWord(String s, int x, int n) {
	word = s; multiplicity = x; numfiles = n;
    }
}


class Vertex {

    String name;
    LinkedList<Edge> edgeList ;

    public Vertex(String s) {
	name = s;
        edgeList = new LinkedList<Edge>();
    }
}

class Edge {
    Vertex origin;
    Vertex destination;
    double cost;

    // edge has no direction, so origin and destination are misnomers

    public Edge(Vertex o, Vertex d, double w) {
        origin = o;
        destination = d;
        cost = w;
    }
}

class UndGraph {
    
    Hashtable<String,Vertex> h;

    public UndGraph() {
	h = new Hashtable<String, Vertex>();
    }

    public void addVertex(String s) {
        Vertex v = new Vertex(s);
        h.put(s,v);
    }

    public void addEdge(String s1, String s2, double w) {
        Vertex u = h.get(s1);
        Vertex v = h.get(s2);
        if ( (u != null) && (v !=null) ) 
	    {
                Edge e = new Edge(u,v,w);
		u.edgeList.add(e);
                v.edgeList.add(e);
            }
    }

    public void printGraph() {
	Collection<Vertex> vertexList = h.values();

        for(Vertex v: vertexList) {
	    System.out.print(v.name + " :");
	    for(Edge e: v.edgeList) {
                if (e.origin == v) 
                    System.out.print(" " + e.destination.name);
                else
                    System.out.print(" " + e.origin.name);
	    }

            System.out.println(" ");
	}
    }


    /*
    public LinkedList<Edge> mst(){}
    */


}

    