Saturday, February 19, 2011

Matching two data tables using sort and Perl

At work we were comparing two measurement techniques that involved two different instrument setups, one taking digital voltage threshold values and the other taking an analog resistance measurement to detect a gross shorting in a circuit. The software interfaces to these instruments were written in LabView a long long time ago, and did not output the data in a similar format. Both measurements were done on the same set of devices, and I wanted to match up the results between the two. My original perl script for this was just a brute force linear search between the two data files, looking for matching pairs and outputting the results in a new table file. Turns out that the ~23,000 data points was a bit too much for such lazy programming, and soon realized that this simple task was taking all day. After a friend showed me how quickly he could match the data sets using C# and LINQ I realized that this script could do something similar if I used perl's "sort" function. So here is the script for future reference.


use warnings;
use strict;


open(DIGITAL, "<spider_mask_defect_database.txt");
open(ANALOG, "<spider_mask_resistance_database.txt");
open(OUTPUT, ">Spider_Mask_Measurement_Verification.txt"); 
open(OUTPUTA, ">Analog_Data.txt");
open(OUTPUTD, ">Digital_Data.txt");

print "Spider_Mask_Measurement_Validation_20101129: Script started...\n";

my %LotNames = ("FDC25-001_Res" => "FDC25-001_Retest2",
        "FDC25-002_Res" => "FDC25-002_Retest",
        "FDC25-002_Shorts_Res" => "FDC25-002_Retest",
        "FDC25-003_Cont_Res" => "FDC25-003",
        "FDC25-003_Shorts_Res" => "FDC25-003",
        "FDC25-004_Res" => "FDC25-004",
        "FDC25-005_Res" => "FDC25-005",
        "FDC25-006_Res" => "FDC25-006",
        "Spider_Mask_Res" => "Spider_Mask_PEN_Lot",
        "FDC25-007_Res" => "FDC25-007");

my %PitchNames = ("3um" => 3,
          "6um" => 6,
          "9um" => 9,
          "12um"=> 12);

my $DigitalHeader = <DIGITAL>;
my $AnalogHeader = <ANALOG>;
my $i=0;
my $j=0;
my @OutputMatrix;

$OutputMatrix[0][0] = "Lot ID";
$OutputMatrix[0][1] = "Wafer ID";
$OutputMatrix[0][2] = "Device Pitch";
$OutputMatrix[0][3] = "Row";
$OutputMatrix[0][4] = "Column";
$OutputMatrix[0][5] = "Device Site";
$OutputMatrix[0][6] = "Analog Continuity";
$OutputMatrix[0][7] = "Analog Shorts";
$OutputMatrix[0][8] = "Digital Continuity";
$OutputMatrix[0][9] = "Digital Shorts\n";

#
# Note that:
# "Lot ID" = "Lot ID"
# "Wafer ID" = "Wafer ID"
# "Device ID" = "Pitch"
# "Row" =  "Position X"
# "Column" = "Position Y"
# "Site ID" = "Site"
#

#
# In resistance database column labels are:
# |   1  |    2   |    3    | 4 |   5  |   6   |     7    |        8       |
# |Lot ID|Wafer ID|Device ID|Row|Column|Site ID|Resistance|Measurement Type|
#
# In defect database column labels are:
# |   1  |  2  | 3  |     4    |     5    |  6  |     7    |   8  |
# |Lot ID|Wafer|Site|Position X|Position Y|Pitch|Continuity|Shorts|
#

my @DIGITALDATA = ();
@DIGITALDATA = <DIGITAL>;
close(DIGITAL);

my @ANALOGDATA = ();
my @dataline = ();
# Use %LotNames to rename all analog lot labels to their digital counterparts.
while (<ANALOG>) {
  @dataline = split(/\t/, $_);
  $dataline[0] = $LotNames{$dataline[0]} or die "$dataline[0] not listed in LotNames: $!\n";
  push(@ANALOGDATA, join("\t", @dataline));
}

if ($#ANALOGDATA <= 0) {
  print "ANALOGDATA is empty\n";
  exit(-1);
}

if ($#DIGITALDATA <= 0) {
  print "DIGITALDATA is empty\n";
  exit(-1);
}

close(ANALOG);

print "Done loading database into memory. Starting data sorting and merging...\n";

my @Sorted_Digital_Data = ();
my @Sorted_Analog_Data = ();
my @Temp_Analog_Data = ();

@Temp_Analog_Data = map { (split /\t/, $_)[2] =~ /([0-9]+)um/; [$1, $_] } @ANALOGDATA;

@Sorted_Digital_Data = sort {(split /\t/, $a)[0] cmp (split /\t/,$b)[0] || (split /\t/,$a)[1] cmp (split /\t/,$b)[1] || (split /\t/,$a)[2] cmp (split /\t/,$b)[2] || (split /\t/,$a)[5] <=> (split /\t/,$b)[5] || (split /\t/,$a)[3] <=> (split /\t/,$b)[3] || (split /\t/,$a)[4] <=> (split /\t/,$b)[4]} @DIGITALDATA;

@Temp_Analog_Data = sort {(split /\t/,$a->[1])[0] cmp (split /\t/,$b->[1])[0] || (split /\t/,$a->[1])[1] cmp (split /\t/,$b->[1])[1] || (split /\t/,$a->[1])[5] cmp (split /\t/,$b->[1])[5] || $a->[0] <=> $b->[0] || (split /\t/,$a->[1])[3] <=> (split /\t/,$b->[1])[3] || (split /\t/,$a->[1])[4] <=> (split /\t/,$b->[1])[4] || (split /\t/,$a->[1])[7] cmp (split /\t/,$b->[1])[7]} @Temp_Analog_Data;

@Sorted_Analog_Data = map { $_->[1] } @Temp_Analog_Data;

# Print out sorted data into files for analysis
print OUTPUTD $DigitalHeader;
for ($i=0; $i<=$#Sorted_Digital_Data; $i++) {
  print OUTPUTD $Sorted_Digital_Data[$i];
}
print OUTPUTA $AnalogHeader;
for ($i=0; $i<=$#Sorted_Analog_Data; $i++) {
  print OUTPUTA $Sorted_Analog_Data[$i];
}

#
# Combine sorted databases into OutputMatrix then dump to file.
# OutputMatrix column labels are:
# |    0   |     1    |      2       |  3  |   4    |      5      |         6         |       7       |         8          |       9        |
# | Lot ID | Wafer ID | Device Pitch | Row | Column | Device Site | Analog Continuity | Analog Shorts | Digital Continuity | Digital Shorts |
#
my $k=1;
$i = 0;
for ($j = 0; $j <= $#Sorted_Analog_Data; $j=$j+2) {
      @dataline = split(/\t/, $Sorted_Digital_Data[$i]);
      if ( ($dataline[0] eq (split /\t/, $Sorted_Analog_Data[$j])[0]) && ($dataline[1] eq (split /\t/, $Sorted_Analog_Data[$j])[1]) ) {
    $OutputMatrix[$k][0] = $dataline[0]; # Lot ID
    $OutputMatrix[$k][1] = $dataline[1]; # Wafer ID
    $OutputMatrix[$k][5] = $dataline[2]; # Site ID
    $OutputMatrix[$k][3] = $dataline[3]; # Row
    $OutputMatrix[$k][4] = $dataline[4]; # Column
    $OutputMatrix[$k][2] = $dataline[5]; # Pitch
    $OutputMatrix[$k][8] = $dataline[6]; # Digital continuity
    $OutputMatrix[$k][9] = $dataline[7]; # Digital shorts
    $OutputMatrix[$k][6] = (split /\t/, $Sorted_Analog_Data[$j])[6]; # Analog Continuity
    $OutputMatrix[$k][7] = (split /\t/, $Sorted_Analog_Data[$j+1])[6]; # Analog Shorts
    $i = $i + 1;
    $k = $k + 1;
    }
  }

# Print sorted database to file.
for ($j=0; $j<=$#OutputMatrix; $j++) {
  for ($k=0; $k<=9; $k++) {
    if ($k<9) {
      print OUTPUT $OutputMatrix[$j][$k], "\t";
    }
    else {
      print OUTPUT $OutputMatrix[$j][$k];
    }
  }
}

close(OUTPUT);
close(OUTPUTA);
close(OUTPUTD);

print "Spider_Mask_Measurement_Validation_20101129: Script finished...\n";