I would like to write a bash script, using awk, to determine how many lines start with each character.
Sample input: ./script.sh txt1 txt2 text1 text2
(filenames could be random too)
txt1
asdaga
dasdag
asdasdag
awqr
zvvbrh
tqetvh
xbrrte
txt2
npoajd
pojta
pskdna
nghir
asdt
bmkgjk
Sample output:
--- txt1 ---
a : 3
b : 0
c : 0
...
z : 1
...
ascii255 : 0
--- txt2 ---
a : 1
b : 1
...
p : 2
...
--- text3 ---
etc
where [character] : [number of rows that start with that character]
is the correct format.
After printing every file one by one, I would also like to print a collective result that follows the same format: every character count will show the sum over all of the text files, so in the given example (for only txt1 and txt2) the output would be:
a : 4
b : 1
...
(explanation: txt1 contains 3 lines that start with a, txt2 contains 1 line that starts with a, so the total will be 3 + 1 = 4)
Here is the code that I wrote, but I am stuck: it doesn't work, and I am confused by the awk syntax:
#!/bin/bash
# Asker's non-working attempt, reproduced verbatim below.
# As written, awk is given no file operands: the script's own arguments
# (txt1, txt2, ...) are never forwarded to it.
# NOTE(review): the loop steps "i )" / "j )" and the statement
#   "char[actchar] ;" look like a "++" was lost when the post was
#   extracted -- confirm against the original post.
# NOTE(review): substr(1,1,1) takes a substring of the literal 1 rather
#   than of the input line; "prinf" is not an awk built-in (printf was
#   presumably intended); four "{" are opened but only three "}" close them.
awk '
{split($0,arr)
n=length(arr)
for(i=1;i<=255;i ){
char[i]=0;
}
for(i=1;i<=n;i ){
actchar=substr(1,1,1);
char[actchar] ;
printf("--- %s ---\n",FILENAME);
for(j=1;j<=255;j ){
prinf("%c : %s\n",j,char[j]);
}
}
'
CodePudding user response:
This solution safely skips multi-byte characters if that's the first character; it works the same for gawk byte-mode or unicode-mode:
# Reading aid for the obfuscated awk program below (all names are underscores):
#   BEGIN      split(___,__) on an unset string yields 0, so _ becomes 1 and
#              FS becomes substr("^$",1), i.e. "^$".
#   FNR==_     on the first record of each file, printreport() is called with
#              the previous file name, then ___ is set from the current
#              FILENAME (or a STDIN label when FILENAME is "-").
#   every row  the array __ is indexed by substr($0,1,1), the first character
#              of the line ($!_ is $0 because !_ is 0).
#   END        printreport() is called once more for the last file.
#   printreport(counts, name): returns early when name is empty; otherwise
#              loops _ from 2^3*4 = 32 up to (excluding) 4^3*2-1 = 127,
#              printing one row per character, then empties the array with
#              split("",__).
# NOTE(review): the loop step "_ )", the bare "__[...]" statement,
#   "____ =__[___]" and the ".f" conversions look truncated -- "++", "+="
#   and a "%<width>" prefix were presumably lost in extraction; confirm
#   against the original answer before running.
# pv, mawk2 and ecp are external commands that are not defined here.
% pv -q < "${m3t}" | mawk2 '
function printreport(__,___,_,____) {
if (___=="") {
return ___
}
printf(" ======= %s ================\n",___)
for (_=2^3*4;_<(4^3*2-1);_ ) {
printf(" [ %s ] = %9.f | .f \n",
___=sprintf("%c",_),
__[___], ____ =__[___])
}
printf(" =====================================\n"\
" ASCII 32(spc)-126(~) sum = .f\n\n",____)
return split("",__)
}
BEGIN { FS = substr("^$",\
_ = !split(___,__))
} FNR== _ {
___=substr(FILENAME != "-" ? FILENAME \
: " /dev/fd/0 :: STDIN ", !-printreport(__,___))
} {
__[substr($!_,_,_)]
} END {
printreport(__,___) } ' "${m3l}" "${m3m}" '/dev/stdin' | ecp;
======= .../m23lyricsFLT_05.txt ================
[ ] = 7 | 7
[ ! ] = 0 | 7
[ " ] = 51 | 58
[ # ] = 62 | 120
[ $ ] = 3 | 123
[ % ] = 0 | 123
[ & ] = 0 | 123
[ ' ] = 443 | 566
[ ( ] = 1766 | 2332
[ ) ] = 2 | 2334
[ * ] = 944 | 3278
[ + ] = 1 | 3279
[ , ] = 1 | 3280
[ - ] = 75 | 3355
[ . ] = 22 | 3377
[ / ] = 58 | 3435
[ 0 ] = 158142 | 161577
[ 1 ] = 2090 | 163667
[ 2 ] = 131 | 163798
[ 3 ] = 57 | 163855
[ 4 ] = 31 | 163886
[ 5 ] = 53 | 163939
[ 6 ] = 16 | 163955
[ 7 ] = 38 | 163993
[ 8 ] = 11 | 164004
[ 9 ] = 22 | 164026
[ : ] = 6 | 164032
[ ; ] = 1 | 164033
[ < ] = 158 | 164191
[ = ] = 0 | 164191
[ > ] = 3 | 164194
[ ? ] = 18 | 164212
[ @ ] = 8 | 164220
[ A ] = 1552 | 165772
[ B ] = 1407 | 167179
[ C ] = 1210 | 168389
[ D ] = 1186 | 169575
[ E ] = 570 | 170145
[ F ] = 568 | 170713
[ G ] = 796 | 171509
[ H ] = 2211 | 173720
[ I ] = 6825 | 180545
[ J ] = 397 | 180942
[ K ] = 160 | 181102
[ L ] = 1516 | 182618
[ M ] = 941 | 183559
[ N ] = 737 | 184296
[ O ] = 1640 | 185936
[ P ] = 460 | 186396
[ Q ] = 40 | 186436
[ R ] = 925 | 187361
[ S ] = 2286 | 189647
[ T ] = 2119 | 191766
[ U ] = 348 | 192114
[ V ] = 943 | 193057
[ W ] = 2353 | 195410
[ X ] = 14 | 195424
[ Y ] = 2941 | 198365
[ Z ] = 30 | 198395
[ [ ] = 3669 | 202064
[ \ ] = 0 | 202064
[ ] ] = 0 | 202064
[ ^ ] = 0 | 202064
[ _ ] = 0 | 202064
[ ` ] = 0 | 202064
[ a ] = 291 | 202355
[ b ] = 251 | 202606
[ c ] = 246 | 202852
[ d ] = 127 | 202979
[ e ] = 88 | 203067
[ f ] = 74 | 203141
[ g ] = 108 | 203249
[ h ] = 403 | 203652
[ i ] = 572 | 204224
[ j ] = 62 | 204286
[ k ] = 48 | 204334
[ l ] = 204 | 204538
[ m ] = 174 | 204712
[ n ] = 135 | 204847
[ o ] = 363 | 205210
[ p ] = 77 | 205287
[ q ] = 6 | 205293
[ r ] = 292 | 205585
[ s ] = 376 | 205961
[ t ] = 288 | 206249
[ u ] = 98 | 206347
[ v ] = 319 | 206666
[ w ] = 404 | 207070
[ x ] = 11 | 207081
[ y ] = 522 | 207603
[ z ] = 22 | 207625
[ { ] = 4 | 207629
[ | ] = 0 | 207629
[ } ] = 0 | 207629
[ ~ ] = 3 | 207632
=====================================
ASCII 32(spc)-126(~) sum = 207632
======= .../m3vid_genie26.txt ================
[ ] = 0 | 0
[ ! ] = 1 | 1
[ " ] = 4 | 5
[ # ] = 106 | 111
[ $ ] = 8 | 119
[ % ] = 1 | 120
[ & ] = 6 | 126
[ ' ] = 294 | 420
[ ( ] = 188 | 608
[ ) ] = 0 | 608
[ * ] = 5 | 613
[ + ] = 2 | 615
[ , ] = 0 | 615
[ - ] = 4 | 619
[ . ] = 50 | 669
[ / ] = 0 | 669
[ 0 ] = 86 | 755
[ 1 ] = 521 | 1276
[ 2 ] = 457 | 1733
[ 3 ] = 198 | 1931
[ 4 ] = 178 | 2109
[ 5 ] = 150 | 2259
[ 6 ] = 86 | 2345
[ 7 ] = 126 | 2471
[ 8 ] = 91 | 2562
[ 9 ] = 123 | 2685
[ : ] = 0 | 2685
[ ; ] = 0 | 2685
[ < ] = 46 | 2731
[ = ] = 0 | 2731
[ > ] = 3 | 2734
[ ? ] = 6 | 2740
[ @ ] = 0 | 2740
[ A ] = 3190 | 5930
[ B ] = 4078 | 10008
[ C ] = 3279 | 13287
[ D ] = 3330 | 16617
[ E ] = 1474 | 18091
[ F ] = 2745 | 20836
[ G ] = 2337 | 23173
[ H ] = 3139 | 26312
[ I ] = 5411 | 31723
[ J ] = 981 | 32704
[ K ] = 893 | 33597
[ L ] = 4264 | 37861
[ M ] = 4134 | 41995
[ N ] = 1972 | 43967
[ O ] = 1996 | 45963
[ P ] = 2409 | 48372
[ Q ] = 94 | 48466
[ R ] = 2262 | 50728
[ S ] = 6701 | 57429
[ T ] = 5794 | 63223
[ U ] = 717 | 63940
[ V ] = 554 | 64494
[ W ] = 4119 | 68613
[ X ] = 106 | 68719
[ Y ] = 1644 | 70363
[ Z ] = 145 | 70508
[ [ ] = 20079 | 90587
[ \ ] = 0 | 90587
[ ] ] = 0 | 90587
[ ^ ] = 0 | 90587
[ _ ] = 0 | 90587
[ ` ] = 0 | 90587
[ a ] = 117 | 90704
[ b ] = 132 | 90836
[ c ] = 128 | 90964
[ d ] = 83 | 91047
[ e ] = 60 | 91107
[ f ] = 114 | 91221
[ g ] = 104 | 91325
[ h ] = 103 | 91428
[ i ] = 143 | 91571
[ j ] = 26 | 91597
[ k ] = 21 | 91618
[ l ] = 117 | 91735
[ m ] = 145 | 91880
[ n ] = 72 | 91952
[ o ] = 67 | 92019
[ p ] = 95 | 92114
[ q ] = 4 | 92118
[ r ] = 68 | 92186
[ s ] = 222 | 92408
[ t ] = 149 | 92557
[ u ] = 16 | 92573
[ v ] = 22 | 92595
[ w ] = 167 | 92762
[ x ] = 2 | 92764
[ y ] = 47 | 92811
[ z ] = 4 | 92815
[ { ] = 0 | 92815
[ | ] = 0 | 92815
[ } ] = 0 | 92815
[ ~ ] = 3 | 92818
=====================================
ASCII 32(spc)-126(~) sum = 92818
======= /dev/stdin ================
[ ] = 0 | 0
[ ! ] = 5 | 5
[ " ] = 7062 | 7067
[ # ] = 3889 | 10956
[ $ ] = 308 | 11264
[ % ] = 165 | 11429
[ & ] = 3210 | 14639
[ ' ] = 38770 | 53409
[ ( ] = 105671 | 159080
[ ) ] = 307 | 159387
[ * ] = 11556 | 170943
[ + ] = 240 | 171183
[ , ] = 0 | 171183
[ - ] = 14565 | 185748
[ . ] = 27 | 185775
[ / ] = 2010 | 187785
[ 0 ] = 5489 | 193274
[ 1 ] = 51256 | 244530
[ 2 ] = 41364 | 285894
[ 3 ] = 20015 | 305909
[ 4 ] = 12961 | 318870
[ 5 ] = 9864 | 328734
[ 6 ] = 7294 | 336028
[ 7 ] = 6514 | 342542
[ 8 ] = 5800 | 348342
[ 9 ] = 5525 | 353867
[ : ] = 7 | 353874
[ ; ] = 0 | 353874
[ < ] = 2433 | 356307
[ = ] = 0 | 356307
[ > ] = 226 | 356533
[ ? ] = 17 | 356550
[ @ ] = 281 | 356831
[ A ] = 375661 | 732492
[ B ] = 331981 | 1064473
[ C ] = 271228 | 1335701
[ D ] = 270206 | 1605907
[ E ] = 144476 | 1750383
[ F ] = 262067 | 2012450
[ G ] = 158453 | 2170903
[ H ] = 204592 | 2375495
[ I ] = 501327 | 2876822
[ J ] = 119037 | 2995859
[ K ] = 94295 | 3090154
[ L ] = 280855 | 3371009
[ M ] = 312797 | 3683806
[ N ] = 160272 | 3844078
[ O ] = 160304 | 4004382
[ P ] = 197434 | 4201816
[ Q ] = 19418 | 4221234
[ R ] = 163032 | 4384266
[ S ] = 494497 | 4878763
[ T ] = 461447 | 5340210
[ U ] = 51570 | 5391780
[ V ] = 79325 | 5471105
[ W ] = 269542 | 5740647
[ X ] = 6973 | 5747620
[ Y ] = 162431 | 5910051
[ Z ] = 19564 | 5929615
[ [ ] = 36976 | 5966591
[ \ ] = 0 | 5966591
[ ] ] = 199 | 5966790
[ ^ ] = 13 | 5966803
[ _ ] = 594 | 5967397
[ ` ] = 0 | 5967397
[ a ] = 59000 | 6026397
[ b ] = 39103 | 6065500
[ c ] = 23406 | 6088906
[ d ] = 17316 | 6106222
[ e ] = 9960 | 6116182
[ f ] = 27632 | 6143814
[ g ] = 15660 | 6159474
[ h ] = 21529 | 6181003
[ i ] = 43845 | 6224848
[ j ] = 7824 | 6232672
[ k ] = 5854 | 6238526
[ l ] = 25302 | 6263828
[ m ] = 25061 | 6288889
[ n ] = 17172 | 6306061
[ o ] = 29060 | 6335121
[ p ] = 11470 | 6346591
[ q ] = 1561 | 6348152
[ r ] = 10232 | 6358384
[ s ] = 42816 | 6401200
[ t ] = 72947 | 6474147
[ u ] = 6623 | 6480770
[ v ] = 1806 | 6482576
[ w ] = 57864 | 6540440
[ x ] = 969 | 6541409
[ y ] = 38921 | 6580330
[ z ] = 1544 | 6581874
[ { ] = 272 | 6582146
[ | ] = 0 | 6582146
[ } ] = 3 | 6582149
[ ~ ] = 406 | 6582555
=====================================
ASCII 32(spc)-126(~) sum = 6582555
CodePudding user response:
This may be what you're trying to do, using any awk:
$ cat tst.sh
#!/usr/bin/env bash
# For each input file, print how many lines start with each character in the
# range beg..end (lowercase a-z by default), followed by the totals over all
# files. Output rows have the form "<char> : <count>".
# Usage: tst.sh [file...]   (with no arguments, "-" is passed to awk instead)
awk '
  {
    # Tally the first character of every line, keyed by file name.
    char = substr($0, 1, 1)
    cnt[FILENAME, char]++
  }
  END {
    OFS = " : "
    beg = 97     # character code of "a"
    end = 122    # character code of "z"
    for ( fileNr = 1; fileNr < ARGC; fileNr++ ) {
      fname = ARGV[fileNr]
      print "--- " fname " ---"
      for ( charNr = beg; charNr <= end; charNr++ ) {
        char = sprintf("%c", charNr)
        # Adding 0 makes a never-seen (empty) counter print as 0.
        print char, cnt[fname, char] + 0
        tot[char] += cnt[fname, char]
      }
    }
    print "--- Total ---"
    for ( charNr = beg; charNr <= end; charNr++ ) {
      char = sprintf("%c", charNr)
      print char, tot[char] + 0
    }
  }
' "${@:--}"
$ ./tst.sh txt1 txt2
--- txt1 ---
a : 3
b : 0
c : 0
d : 1
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 0
o : 0
p : 0
q : 0
r : 0
s : 0
t : 1
u : 0
v : 0
w : 0
x : 1
y : 0
z : 1
--- txt2 ---
a : 1
b : 1
c : 0
d : 0
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 2
o : 0
p : 2
q : 0
r : 0
s : 0
t : 0
u : 0
v : 0
w : 0
x : 0
y : 0
z : 0
--- Total ---
a : 4
b : 1
c : 0
d : 1
e : 0
f : 0
g : 0
h : 0
i : 0
j : 0
k : 0
l : 0
m : 0
n : 2
o : 0
p : 2
q : 0
r : 0
s : 0
t : 1
u : 0
v : 0
w : 0
x : 1
y : 0
z : 1
If you want to loop over some larger range of characters, just change the beg and end variable settings.