Home > Blockchain >  Analysis on the basis of comparison of 1st column of 1 files with 1st column of N number of files an
Analysis on the basis of comparison of 1st column of 1 files with 1st column of N number of files an

Time:02-01

I have tab separated files and need to compare FILE_1 with N (10) files, If the IDS of column 1 of first file match with the 1st column of other files print file 1 and value of the other files and if the IDS not presnt , first file and NA to the column of other file. The example of the input and expected output file are given below.

File 1

A   1.1  0.2    0.3     1.1
B   1.3  2.1    0.2     0.1
C   1.8  0.5    2.6     3.8
D   1.2  5.1    1.7     0.1
E   1.9  4.3    2.8     1.6
F   1.6  5.1    2.9     7.1
G   1.8  2.8    0.3     3.7
H   1.9  3.6    3.7     0.1
I   1.0  2.4    4.9     2.5
J   1.1  2.0    0.1     0.4

File 2

A   d1   Q2    Q.3  E.1
B   a.3  S.1   A.2  R.1
J   a.1  2.0   031  4a4

File 3

E    1d9    4a3  2A8    1D6
F    1a.6   5a1  2W9    7Q1
J    QA8    1.8  0W3    3E7

File 4

F   1aa  5a   2Q    7WQ
G   ac   UW   0QW   3aQ
A   QQ   aws  AW    qw

I have tried the following code with two file initially but not getting the expected output

awk  '
        FILENAME == "File_2" {
                id = $0
                val[id] = $2","$3","$5
        }
        FILENAME == "File_1" {
                id = $1
                string
                if (val[id] == "") {
                        print id " " "NA"
                } else {
                        print id " " val[id]
                }
        }
' File_2 File_1 

The above code print the File_2 and NA at the end of each line.

My expected output is looks like below

Final Expected Output

A  1.1  0.2  0.3  1.1  d1   Q2   Q.3  E.1  NA    NA   NA   NA   QQ   aws  AW   qw
B  1.3  2.1  0.2  0.1  a.3  S.1  A.2  R.1  NA    NA   NA   NA   NA   NA   NA   NA
C  1.8  0.5  2.6  3.8  NA   NA   NA   NA   NA    NA   NA   NA   NA   NA   NA   NA
D  1.2  5.1  1.7  0.1  NA   NA   NA   NA   NA    NA   NA   NA   NA   NA   NA   NA
E  1.9  4.3  2.8  1.6  NA   NA   NA   NA   1d9   4a3  2A8  1D6  NA   NA   NA   NA
F  1.6  5.1  2.9  7.1  NA   NA   NA   NA   1a.6  5a1  2W9  7Q1  1aa  5a   2Q   7WQ
G  1.8  2.8  0.3  3.7  NA   NA   NA   NA   NA    NA   NA   NA   ac   UW   0QW  3aQ
H  1.9  3.6  3.7  0.1  NA   NA   NA   NA   NA    NA   NA   NA   NA   NA   NA   NA
I  1.0  2.4  4.9  2.5  NA   NA   NA   NA   NA    NA   NA   NA   NA   NA   NA   NA
J  1.1  2.0  0.1  0.4  a.1  2.0  031  4a4  QA8   1.8  0W3  3E7  NA   NA   NA   NA

CodePudding user response:

Using GNU awk for arrays of arrays, ARGIND, and gensub():

$ cat tst.awk
BEGIN { FS=OFS="\t" }
ARGIND < (ARGC-1) {
    key = $1
    sub("[^"FS"] "FS"?","")
    fileNrsKeys2vals[ARGIND][key] = $0
    fileNrs2numFlds[ARGIND] = NF
    next
}
{
    printf "%s", $0
    for ( fileNr=1; fileNr<ARGIND; fileNr   ) {
        if ( fileNr in fileNrs2numFlds ) {
            numFlds = fileNrs2numFlds[fileNr]
            printf "%s", ( $1 in fileNrsKeys2vals[fileNr] ?
                OFS fileNrsKeys2vals[fileNr][$1] :
                gensub(/ /,OFS"NA","g",sprintf("%*s",numFlds,"")) )
        }
    }
    print ""
}

$ awk -f tst.awk file2 file3 file4 file1
A       1.1     0.2     0.3     1.1     d1      Q2      Q.3     E.1     NA      NA      NA      NA      QQ      aws     AW      qw
B       1.3     2.1     0.2     0.1     a.3     S.1     A.2     R.1     NA      NA      NA      NA      NA      NA      NA      NA
C       1.8     0.5     2.6     3.8     NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA
D       1.2     5.1     1.7     0.1     NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA
E       1.9     4.3     2.8     1.6     NA      NA      NA      NA      1d9     4a3     2A8     1D6     NA      NA      NA      NA
F       1.6     5.1     2.9     7.1     NA      NA      NA      NA      1a.6    5a1     2W9     7Q1     1aa     5a      2Q      7WQ
G       1.8     2.8     0.3     3.7     NA      NA      NA      NA      NA      NA      NA      NA      ac      UW      0QW     3aQ
H       1.9     3.6     3.7     0.1     NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA
I       1.0     2.4     4.9     2.5     NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA      NA
J       1.1     2.0     0.1     0.4     a.1     2.0     031     4a4     QA8     1.8     0W3     3E7     NA      NA      NA      NA

CodePudding user response:

This solution requires a " | sort" since awk arrays are not guaranteed to be in order. It also is sensitive to the number of spaces immediately following the index letter ("A", "B", "C", etc.):

Mac_3.2.57$cat mergeLinesV0.awk
BEGIN {
 i1=1
 i2=1
 i3=1
 i4=1
} NR == FNR {
  ar1[i1]=$0
  i1=i1 1
  f1size=FNR
  next
}{
  f1done=1
} NR-f1size == FNR && f1done {
  ar2[i2]=$0
  i2=i2 1
  f2size=FNR
  next
}{
  f2done=1
} NR-f1size-f2size == FNR && f2done {
  ar3[i3]=$0
  i3=i3 1
  f3size=FNR
  next
}{
  f3done=1
} NR-f1size-f2size-f3size == FNR && f3done {
  ar4[i4]=$0
  i4=i4 1
  f4size=FNR
  next
} END {
  for(i1 in ar1){
    printf("%s   ", ar1[i1])
    found2=0
    for(i2 in ar2){
      if(substr(ar1[i1],1,1)==substr(ar2[i2],1,1)){
        printf("%s   ", substr(ar2[i2],5))
        found2=1
      }
    }
    if(!found2){
      printf("NA   NA   NA   NA   ")
    }
    found3=0
    for(i3 in ar3){
      if(substr(ar1[i1],1,1)==substr(ar3[i3],1,1)){
        printf("%s   ", substr(ar3[i3],5))
        found3=1
      }
    }
    if(!found3){
      printf("NA   NA   NA   NA   ")
    }
    found4=0
    for(i4 in ar4){
      if(substr(ar1[i1],1,1)==substr(ar4[i4],1,1)){
        printf("%s\n", substr(ar4[i4],5))
        found4=1
      }
    }
    if(!found4){
      printf("NA   NA   NA   NA\n")
    }
  }
}

Mac_3.2.57$awk -f mergeLinesV0.awk File1 File2 File3 File4 | sort
A   1.1  0.2    0.3     1.1   d1   Q2    Q.3  E.1   NA   NA   NA   NA   QQ   aws  AW    qw
B   1.3  2.1    0.2     0.1   a.3  S.1   A.2  R.1   NA   NA   NA   NA   NA   NA   NA   NA
C   1.8  0.5    2.6     3.8   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
D   1.2  5.1    1.7     0.1   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
E   1.9  4.3    2.8     1.6   NA   NA   NA   NA   1d9    4a3  2A8    1D6   NA   NA   NA   NA
F   1.6  5.1    2.9     7.1   NA   NA   NA   NA   1a.6   5a1  2W9    7Q1   1aa  5a   2Q    7WQ
G   1.8  2.8    0.3     3.7   NA   NA   NA   NA   NA   NA   NA   NA   ac   UW   0QW   3aQ
H   1.9  3.6    3.7     0.1   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
I   1.0  2.4    4.9     2.5   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
J   1.1  2.0    0.1     0.4   a.1  2.0   031  4a4   QA8    1.8  0W3    3E7   NA   NA   NA   NA

Mac_3.2.57$cat File1
A   1.1  0.2    0.3     1.1
B   1.3  2.1    0.2     0.1
C   1.8  0.5    2.6     3.8
D   1.2  5.1    1.7     0.1
E   1.9  4.3    2.8     1.6
F   1.6  5.1    2.9     7.1
G   1.8  2.8    0.3     3.7
H   1.9  3.6    3.7     0.1
I   1.0  2.4    4.9     2.5
J   1.1  2.0    0.1     0.4
Mac_3.2.57$cat File2
A   d1   Q2    Q.3  E.1
B   a.3  S.1   A.2  R.1
J   a.1  2.0   031  4a4
Mac_3.2.57$cat File3
E   1d9    4a3  2A8    1D6
F   1a.6   5a1  2W9    7Q1
J   QA8    1.8  0W3    3E7
Mac_3.2.57$cat File4
F   1aa  5a   2Q    7WQ
G   ac   UW   0QW   3aQ
A   QQ   aws  AW    qw
Mac_3.2.57$
  • Related