CSV Column Discretization with Java

When mining a large amount of data, you often end up with a lot of columns with continuous values. While this is the most “pure” version of the data, sometimes you want to cluster these values into bins to do things like creating histograms or simply to make it easier to analyze the distribution of the data. I ran into this when I ran a few Hadoop jobs that spit out CSVs as their output. After looking around online for a while, I couldn’t find any readily available solution, so I wrote up something really quickly to do just this.

More information about Discretization can be found on Wikipedia.

In order to use this tool, you’ll need the JavaCSV library.

To use it, execute DiscretizationTool.java, passing in the name of the csv to discretize as a command line parameter. By default, the output will be put into out.txt, but you can also pass in a second command line parameter to specify the output file. (The csv’s first line is a list of the column names with the rest of the lines being the rows of data).

The tool will create “bins” of values and count the frequency for each bin. Currently the number of bins is equal to sqrt(maxValue – minValue). You can change that if you’d like, but that seemed to be a fairly reasonable value to start at. This value is calculated in DiscretizationTool.java’s createColumnBins() method. Here is the code; please feel free to post any questions you may have. I threw this together a while back, but haven’t had a chance to post it until now. It’s not thoroughly tested, but please feel free to make any modifications to it and use it for whatever applications you may need it for. If you find any bugs or have any feature updates, please post a comment and share.

DiscretizationTool.java

import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import com.csvreader.CsvReader;

public class DiscretizationTool {
	private CsvReader reader;
	private String filename;

	/**
	 * Opens up a new CSV to read.
	 * Because we read the file line by line, there is no way of going back.
	 * To go back, we have to reopen the file again.
	 */
	public void openFile(String filename) {
		try {
			this.filename = filename;
			reader = new CsvReader(filename);
			reader.readHeaders();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Restarts the reader to the top of the file again.
	 */
	private void restartFile() {
		try {
			reader = new CsvReader(filename);
			reader.readHeaders();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Prints out the headers and all of the rows of the file.
	 */
	public void listRecords() {
		try {
			System.out.println(Arrays.toString(reader.getHeaders()));
			while(reader.readRecord()) {
				for(int i = 0; i < reader.getColumnCount(); i++) {
					System.out.print(reader.get(i) + " ");
				}
				System.out.print("n");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Cleanup and restart the file for the next go around
			reader.close();
			restartFile();
		}

	}

	/**
	 * @return The max and min values for each of the columns as well as a total row count.
	 */
	private RecordsStatistics getRecordStatistics() {
		try {
			RecordsStatistics rs = new RecordsStatistics();

			rs.headers = reader.getHeaders();

			while(reader.readRecord()) {
				rs.rowCount++;

				for(int i = 0; i < reader.getColumnCount(); i++) {
					String column = reader.getHeader(i);
					int value = Integer.parseInt(reader.get(i));
					if(rs.maxValues.get(column) == null ||
							rs.maxValues.get(column).intValue() < value) {
						rs.maxValues.put(column, value);
					}
					if(rs.minValues.get(column) == null ||
							rs.minValues.get(column).intValue() > value) {
						rs.minValues.put(column, value);
					}
				}
			}

			return rs;
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		} finally {
			// Cleanup and restart the file for the next go around
			reader.close();
			restartFile();
		}
	}

	public HashMap createColumnBins() {
		RecordsStatistics rs = getRecordStatistics();
		int columnCount = rs.headers.length;
		int numberOfBins = 1;
		HashMap bins = new HashMap();

		for(int i = 0; i < columnCount; i++) {
			String column = rs.headers<i>;
			numberOfBins = (int) Math.sqrt(rs.maxValues.get(column).intValue() - rs.minValues.get(column).intValue());
			bins.put(column, ColumnBins.createColumnBins(numberOfBins,
					rs.minValues.get(column).intValue(),
					rs.maxValues.get(column).intValue()));
		}

		return bins;
	}

	private class BinIntegerPair {
		Bin bin;
		int integer;
		public BinIntegerPair(Bin bin, int integer) {
			this.bin = bin;
			this.integer = integer;
		}
	}

	/**
	 * Returns a HashTable, with an entry for each column.
	 * Within each entry is a HashTable that holds the Bin extremes and the
	 * 	frequency at which rows appear within that bin.
	 * @return HashTable>
	 */
	public HashMap> discretize() {
		HashMap allColumnBins = createColumnBins();
		System.out.println("Done creating column bins");
		String[] columns = allColumnBins.keySet().toArray(new String[allColumnBins.keySet().size()]);

		HashMap> columnBins = new HashMap>();
		for(String column : columns) {
			ArrayList bins = new ArrayList();
			for(int i = 0; i < allColumnBins.get(column).bins.length; i++) {
				bins.add(new BinIntegerPair(allColumnBins.get(column).bins<i>, 0));
			}
			columnBins.put(column, bins);
		}
		int line = 1;
		try {
			while(reader.readRecord()) {
				for(int i = 0; i < columns.length; i++) {
					int value = Integer.parseInt(reader.get(columns<i>));
					ArrayList bins = columnBins.get(columns<i>);
					for(BinIntegerPair bin : bins) {
						if(value >= bin.bin.minValue &&
								value <= bin.bin.maxValue) {
							bin.integer += 1;
							break;
						}
					}
				}

				if(line % 100000 == 0) {
					System.out.println(line);
				}

				line++;
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Cleanup and restart the file for the next go around
			reader.close();
			restartFile();
		}

		return columnBins;
	}

	public static void main(String[] args) {
		try {
			String inputFile = args[0];
			String outputFile = "out.txt";
			if(args.length > 1) {
				outputFile = args[1];
			}
			FileWriter writer = new FileWriter(outputFile);
			DiscretizationTool dt = new DiscretizationTool();
			dt.openFile(inputFile);
			Map> discretization = dt.discretize();
			for(Map.Entry> bins : discretization.entrySet()) {
				writer.write(bins.getKey() + ":n");
				for(BinIntegerPair bin : bins.getValue()) {
					writer.write(bin.bin.minValue + "-" + bin.bin.maxValue + ": " + bin.integer +"n");
				}
			}
			writer.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

ColumnBins.java

public class ColumnBins {
	public Bin[] bins;

	private ColumnBins() {}

	public static ColumnBins createColumnBins(int numBins, int minValue, int maxValue) {
		ColumnBins cb = new ColumnBins();
		Bin[] bins = new Bin[numBins];

		int separation = (maxValue - minValue) / numBins + 1;
		for(int i = 0; i < numBins; i++) {
			Bin bin = new Bin();
			bin.minValue = minValue + separation * i;
			bin.maxValue = minValue + separation * (i + 1) - 1;
			bins<i> = bin;
		}

		cb.bins = bins;
		return cb;
	}
}

RecordsStatistics.java

import java.util.HashMap;

/**
 * Summary of one pass over a CSV file: the number of rows read, the column
 * headers, and the largest and smallest value seen per column.
 */
public class RecordsStatistics {
	/** Number of rows counted. */
	public int rowCount;
	/** Largest value recorded per column name. */
	public HashMap<String, Integer> maxValues;
	/** Smallest value recorded per column name. */
	public HashMap<String, Integer> minValues;
	/** Column names. */
	public String[] headers;

	/**
	 * Creates statistics with empty max/min maps and a row count of zero.
	 */
	public RecordsStatistics() {
		maxValues = new HashMap<String, Integer>();
		minValues = new HashMap<String, Integer>();
	}

	/**
	 * @return the row count followed by the max and min maps, concatenated
	 *         without separators
	 */
	@Override
	public String toString() {
		StringBuilder s = new StringBuilder();
		s.append("Row Count: ").append(rowCount);
		s.append("Max Values:").append(maxValues.toString());
		s.append("Min Values:").append(minValues.toString());
		return s.toString();
	}
}

Bin.java

public class Bin {
	public int maxValue;
	public int minValue;

	public boolean equals(Object o) {
		if(o instanceof Bin) {
			if(((Bin) o).maxValue == this.maxValue &&
					((Bin) o).minValue == this.minValue) {
				return true;
			} else {
				return false;
			}
		} else {
			return false;
		}
	}

	public int hashCode() {
		int result = 17;
		result = 37*result + maxValue;
		result = 37*result + minValue;
		return result;
	}
}

Continue the conversation by sharing your comments here on the blog and by following us on Twitter @CTCT_API

Comments

  1. prasanna says:

    showing the error for intValue()

  2. prasanna says:

    cannot find symbol
    symbol: variable bins
    location: class java.lang.Object

    in discretize()

  3. Mhs says:

    hi, I am trying to use your code , but I don’t know how should I add ColumnBins.java and other classes to the descritization class.. can you help me?

Leave a Comment