Sunday, 8 April 2012

CSV handling in Javascript

The other day I was making modifications to a front end app that displayed CSV content in a text box, I really wanted to make this look a little nicer so I started to try and parse the CSV in a meaningful way to build a table and apply some styles...

My first attempts involved using regular expressions and took only a few minutes to write and run, the problem was that the source of the CSVs could not be trusted to put together "safe" CSV file, the format changed from call to call, some values were quoted others were not... it turned into a bugger real fast.

So I sat down and wrote a somewhat more stable CSV parser in javascript to make my life a little easier.

Below is the simple version (I have since added other safety checks and error handling for malformed CSV files) Other changes I plan to make include:

  • Optional escape character
  • Support for double quote escaping
  • Whitespace around delimiter support
  • Option to specify a none comma delimiter

While stepping through the string 1 char at a time may seem slow and cumbersome it really isn't. I have run this on 30k CSV files and it processes in just over a hundred milliseconds - there may be a lot of operations but they are all very simple.

function csvSplit(csv){
  csv = csv.replace(/\r\n/g,'\n')
  var rows = csv.split("\n");
  for (var i=0; i<rows.length; i++){
    var row = rows[i];
    rows[i] = new Array();
    var state = '';
    var col = '';
    for (var j=0; j<row.length; j++){
      var char = row.charAt(j);
      if (j < row.length) {
        var next = row.charAt(j+1);
      }
      else {
        var next = '';
      }
      if (col=='') {
        if (char==',') { // empty value
          rows[i][rows[i].length] = col;
          continue;
        }
        if (char=='"') { // start of double quote value
          if (next=='"') {
            rows[i][rows[i].length] = col;
            j++;
          }
          else {
            state = '"';
          }
          continue;
        }
        if (char=='\'') { // start of single quote value
          if (next=='\'') {
            rows[i][rows[i].length] = col;
            j++;
          }
          else { 
            state = '\'';
          }
          continue;
        }
        else { // start of normal value
          if (char=='\\') {
            col += next;
            j++;
          }
          else {
            col += char;
          }
          continue;
        }
      }
      else { // in value processing
        if (state=='"') {
          if (char=='\\') {
            col += next;
            j++;
            continue;
          }
          if (char=='"') {
            rows[i][rows[i].length] = col;
            col = '';
            if (next==',') {
              j++;
            }
            continue;
          }
          else {
            col += char;
          }
          continue;
        }
        if (state=='\'') {
          if (char=='\\') {
            col += next;
            j++;
            continue;
          }
          if (char=='\'') {
            rows[i][rows[i].length] = col;
            col = '';
            if (next==',') {
              j++;
            }
            continue;
          }
          else {
            col += char;
          }
          continue;
        }
        else {
          if (char=='\\') {
            col += next;
            j++;
            continue;
          }
          if (char==',') {
            rows[i][rows[i].length] = col;
            col = '';
            continue;
          }
          else
            col += char;
          }
          continue;
        }
    }
    rows[i][rows[i].length] = col;
  }
  return rows;
}

mydata = csvSplit(get_data())

You can think of this function as returning an array lines and each line is a array of column values.

The regular expression version (runs well but is difficult to extend to cope with certain error conditions) achieves the same goal by using entity encoding to protect special characters then using those characters to replace tokens (specifically the start and stop quotes) so that internal commas can be correctly escaped.

function csvSplit(csv){
  csv = csv.replace(/\r\n/g,'\n')
  var rows = csv.split("\n");
  for (var i=0; i<rows.length; i++){
    var row = rows[i];
    rows[i] = new Array();
    row = row.replace(/&/g, "&amp;");
    row = row.replace(/\\\\/g, "&#92;");
    row = row.replace(/\\"/g, "&quot;");
    row = row.replace(/\\'/g, "&#39;");
    row = row.replace(/\\,/g, "&#44;");
    row = row.replace(/@/g, "&#64;");
    row = row.replace(/\?/g, "&#63;");
    row = row.replace(/"([^"]*)"/g, "@$1\?");
    while (row.match(/@([^\?]*),([^\?]*)\?/)){
      row = row.replace(/@([^\?]*),([^\?]*)\?/g, "@$1&#44;$2?");
    }
    row = row.replace(/[\?@]/g, "");
    row = row.replace(/\'([^\']*)\'/g, "@$1\?");
    while (row.match(/@([^\?]*),([^\?]*)\?/)){
      row = row.replace(/@([^\?]*),([^\?]*)\?/g, "@$1&#44;$2?");
    }
    row = row.replace(/[\?@]/g, "");
    row = row.split(",")
    for (var j=0; j<row.length; j++){
      col = row[j];
      col = col.replace(/&#63;/g, "\?");
      col = col.replace(/&#64;/g, "@");
      col = col.replace(/&#44;/g, ",");
      col = col.replace(/&#39;/g, '\'');
      col = col.replace(/&quot;/g, '\"');
      col = col.replace(/&#92;/g, '\\');
      col = col.replace(/&amp;/g, "&");
      row[j]=col;
    }
    rows[i] = row;
  }
  return rows;
}






No comments:

Post a Comment