Home > Blockchain >  How can I improve the execution speed of my script
How can I improve the execution speed of my script

Time:11-30

I have a file containing about 15 million records. Below is a sample of the data

99001597,555555555555,3211,Njro_Kaniani,test,NORTH,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,IN2017,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001679,555555555555,1756,Bnju_HTT,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2012,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001680,555555555555,1108,Temoni_Kiara,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2028,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001683,555555555555,1604,Blue_Bay,Nzindo,,Y,COAST,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1820,Sgerea_Makuka,Salaam,,N,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1184,Makka,Salaam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1381,Leaders_Club,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1037,Mbez,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1041,Ngano,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001683,555555555555,1313,Kichangani,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2011,OnePlus,1,N/A,Yes,Yes,Yes,N/A
99001684,555555555555,4975,Nyugusu Campp2,Test,test,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,1041,Ngano,Salam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,1420,Airport_Macro,Salaam,,Y,RAD,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,3147,Technical_Nzoti,test,ORTH,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,4488,Lumala,Mwnza,,Y,Nyeka,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google,Yes,Yes,Yes,N/A
99001684,555555555555,4975,Nyarugusu Campp2,Kigoma,,Y,Nyeka,Yes,Yes,Yes,smart,Yes,Yes,Yes,Yes,Yes,Yes,BE2026,OnePlus,1,Google

I am using the below script to count the occurrence of lines matching certain conditions. The problem is this script is very slow. I get about 200 output lines in a day. Currently, my program will read the 15 million record file 36,000 times. This is very inefficient (slow!!). How can I rework my script to just read the very large file just one time?

Desired output

1037,0,0,1,1,1,1,1,1,1,1,1,1
1041,0,0,2,2,2,2,2,2,2,2,2,2
1108,0,0,1,1,1,1,1,1,1,1,1,1
1184,0,0,1,1,1,1,1,1,1,1,1,1
1313,0,0,1,1,1,1,1,1,1,1,1,1
1381,0,0,1,1,1,1,1,1,1,1,1,1
1420,0,0,1,1,1,1,1,1,1,1,1,1
1604,0,0,1,1,1,1,1,1,1,1,1,1
1756,0,0,1,1,1,1,1,1,1,1,1,1
1820,0,0,1,1,1,1,1,1,1,1,1,1
3147,0,0,1,1,1,1,1,0,0,0,0,1
3211,0,0,1,1,1,1,1,0,0,0,0,1
4488,0,0,1,1,1,1,1,1,1,1,1,1
4975,0,0,2,2,2,2,2,1,1,0,0,1

The IDs_file file contains about 3000 records each a 4 digit number

while read i
do

twog=$(cat combined_marketing_sadm_report.csv|awk -v src=$i -F, '{if ((($10 == "Yes")||($10 == "No")) && ($3 == src) && ($9 == "No")&& ($11 == "No")) print $0;}'|wc -l)

threeg=$(cat combined_marketing_sadm_report.csv|awk -v src=$i  -F,'{if (($3 == src) &&($9 == "Yes")&& ($11 == "No")) print $0;}'|wc -l)

fourg=$(cat combined_marketing_sadm_report.csv|awk -v src=$i -F,'{if (($11 == "Yes") && ($3 == src)) print $0;}'|wc -l)
lte2100=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($13 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte800=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($14 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte700=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($15 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte1800=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($16 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte2600=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($17 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
lte900=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($18 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
threeg2100=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($24 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
threeg900=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($25 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)
volte=$(cat  combined_marketing_sadm_report.csv|awk -v src=$i  -F, '{if (($23 == "Yes") &&  ($3 == src)) print $0;}'|wc -l)

echo $i,$twog,$threeg,$fourg,$lte2100,$lte800,$lte700,$lte1800,$lte2600,$lte900,$threeg2100,$threeg900,$volte>>Raw_data_for_report.csv
done < IDs_file

CodePudding user response:

Solution: put all your loops in one awk program:

#! /usr/bin/awk -f

BEGIN {
    FS=OFS=","
    if (src=="") {
        exit
    }
    split(src,arr_src,",")
    for (i in arr_src) {
        src=arr_src[i]
        twog[src]=threeg[src]=fourg[src]=lte2100[src]=lte800[src]=lte700[src]=lte1800[src]=lte2600[src]=lte900[src]=threeg2100[src]=threeg900[src]=volte[src]=0
    }
}
{
    for (i in arr_src) {
        src=arr_src[i]
        if ($3 == src) {
            if (($10 == "Yes" || $10 == "No") && $9 == "No" && $11 == "No") twog[src]  
            if ($9  == "Yes" && $11 == "No") threeg[src]  
            if ($11 == "Yes") fourg[src]  
            if ($13 == "Yes") lte2100[src]  
            if ($14 == "Yes") lte800[src]  
            if ($15 == "Yes") lte700[src]  
            if ($16 == "Yes") lte1800[src]  
            if ($17 == "Yes") lte2600[src]  
            if ($18 == "Yes") lte900[src]  
            if ($24 == "Yes") threeg2100[src]  
            if ($25 == "Yes") threeg900[src]  
            if ($23 == "Yes") volte[src]  
        }
    }
}
END {
    for (i in arr_src) {
        src=arr_src[i]
        print src,twog[src],threeg[src],fourg[src],lte2100[src],lte800[src],lte700[src],lte1800[src],lte2600[src],lte900[src],threeg2100[src],threeg900[src],volte[src]
    }
}

Call:

./counter.awk -v src=1037,1041,4975 combined_marketing_sadm_report.csv

UPDATE

If your src values are in a file, the script (counter-v2.awk) became:

#! /usr/bin/awk -f

BEGIN {
    FS=OFS=","
}
FNR == NR {
    i  
    arr_src[i] = $0
    next
}
FNR == 1 {
    for (i in arr_src) {
        src=arr_src[i]
        twog[src]=threeg[src]=fourg[src]=lte2100[src]=lte800[src]=lte700[src]=lte1800[src]=lte2600[src]=lte900[src]=threeg2100[src]=threeg900[src]=volte[src]=0
    }
}
{
    for (i in arr_src) {
        src=arr_src[i]
        if ($3 == src) {
            if (($10 == "Yes" || $10 == "No") && $9 == "No" && $11 == "No") twog[src]  
            if ($9  == "Yes" && $11 == "No") threeg[src]  
            if ($11 == "Yes") fourg[src]  
            if ($13 == "Yes") lte2100[src]  
            if ($14 == "Yes") lte800[src]  
            if ($15 == "Yes") lte700[src]  
            if ($16 == "Yes") lte1800[src]  
            if ($17 == "Yes") lte2600[src]  
            if ($18 == "Yes") lte900[src]  
            if ($24 == "Yes") threeg2100[src]  
            if ($25 == "Yes") threeg900[src]  
            if ($23 == "Yes") volte[src]  
        }
    }
}
END {
    for (i in arr_src) {
        src=arr_src[i]
        print src,twog[src],threeg[src],fourg[src],lte2100[src],lte800[src],lte700[src],lte1800[src],lte2600[src],lte900[src],threeg2100[src],threeg900[src],volte[src]
    }
}

File IDSs_file:

1037
1041
1108
1184
1313
1381
1420
1604
1756
1820
3147
3211
4488
4975

Execute like this (Warning: Files order is very important):

./counter-v2.awk IDSs_file combined_marketing_sadm_report.csv

Output:

1037,0,0,1,1,1,1,1,1,1,1,1,1
1041,0,0,2,2,2,2,2,2,2,2,2,2
1108,0,0,1,1,1,1,1,1,1,1,1,1
1184,0,0,1,1,1,1,1,1,1,1,1,1
1313,0,0,1,1,1,1,1,1,1,1,1,1
1381,0,0,1,1,1,1,1,1,1,1,1,1
1420,0,0,1,1,1,1,1,1,1,1,1,1
1604,0,0,1,1,1,1,1,1,1,1,1,1
1756,0,0,1,1,1,1,1,1,1,1,1,1
1820,0,0,1,1,1,1,1,1,1,1,1,1
3147,0,0,1,1,1,1,1,0,0,0,0,1
3211,0,0,1,1,1,1,1,0,0,0,0,1
4488,0,0,1,1,1,1,1,1,1,1,1,1
4975,0,0,2,2,2,2,2,1,1,0,0,1
  • Related