Data Sanitizer
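I wrote a C# console app to sanitize delimited text files (CSV-style, using “|” as the field separator). It masks sensitive fields with X’s: every letter and digit in the social security number and telephone number fields is replaced, so that social security number 123456 becomes XXXXXX, while in the address fields only the digits are replaced, so that the address “49 Main St” becomes “XX Main St”. Both the input and the output are streamed, so the program won’t run out of memory even with arbitrarily large files.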
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace DataSanitizer
{
    /// <summary>
    /// Console tool that masks sensitive fields in a "|"-delimited text file.
    /// The input is read and the output written one line at a time, so memory
    /// use does not grow with the size of the file.
    /// </summary>
    class Program
    {
        private const string Delimiter = "|";
        private const string ReplacementCharacter = "X";

        // Built once instead of once per input line.
        private static readonly char[] DelimiterChars = Delimiter.ToCharArray();

        // Zero-based positions of the fields in which every word character
        // (letter, digit or underscore) is masked: SSN, home/office/mobile telephone.
        private static readonly int[] AlphanumericFieldIndexes = { 2, 9, 10, 11 };

        // Zero-based positions of the fields in which only digits are masked:
        // address lines 1 to 3.
        private static readonly int[] NumericFieldIndexes = { 4, 5, 6 };

        // The patterns are loop-invariant, so they are constructed a single time.
        private static readonly Regex Alphanumeric = new Regex(@"\w", RegexOptions.Compiled);
        private static readonly Regex Numeric = new Regex(@"\d", RegexOptions.Compiled);

        /// <summary>
        /// Entry point. Expects two arguments: the input file to sanitize and
        /// the output file that receives the sanitized lines.
        /// </summary>
        /// <param name="args">args[0] = input file path, args[1] = output file path.</param>
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("Usage: DataSanitizer.exe input_file output_file");
                return;
            }

            string inputFilePathAndName = args[0];
            string outputFilePathAndName = args[1];

            // NOTE(review): the writer is opened in append mode, so re-running
            // against an existing output file adds to it rather than replacing
            // it -- confirm this is intended.
            using (StreamReader sr = new StreamReader(inputFilePathAndName))
            using (StreamWriter sw = new StreamWriter(outputFilePathAndName, true))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    sw.WriteLine(SanitizeLine(line));
                }
            }
        }

        /// <summary>
        /// Masks the sensitive fields of a single delimited record.
        /// </summary>
        /// <remarks>
        /// Records are written back with exactly the fields they arrived with:
        /// a record shorter than expected (including an empty line) is passed
        /// through with whatever sensitive fields it does contain masked, and
        /// fields beyond the last sensitive position are preserved unchanged.
        /// </remarks>
        /// <param name="line">One line of input, without its line terminator.</param>
        /// <returns>The line with the sensitive fields masked.</returns>
        internal static string SanitizeLine(string line)
        {
            string[] fields = line.Split(DelimiterChars);

            MaskFields(fields, AlphanumericFieldIndexes, Alphanumeric);
            MaskFields(fields, NumericFieldIndexes, Numeric);

            return string.Join(Delimiter, fields);
        }

        /// <summary>
        /// Replaces, in place, every match of <paramref name="pattern"/> in the
        /// fields at the given positions with the replacement character.
        /// </summary>
        /// <param name="fields">The fields of one record; modified in place.</param>
        /// <param name="indexes">Zero-based positions of the fields to mask.</param>
        /// <param name="pattern">Pattern matching the characters to mask.</param>
        private static void MaskFields(string[] fields, int[] indexes, Regex pattern)
        {
            foreach (int index in indexes)
            {
                // Short or malformed records may not reach this position;
                // skip it instead of throwing IndexOutOfRangeException.
                if (index < fields.Length)
                {
                    fields[index] = pattern.Replace(fields[index], ReplacementCharacter);
                }
            }
        }
    }
}
One Response to “Data Sanitizer”
1 dirn 16 April 2010 @ 12:20 am
seems like we’re dealing with the same stuff but different property / value
Comments: