I have a lookup file that I use to search the available records in file_2 and if such records are present then replace those records with #. Currently my code is substituting the entire record with # but I need to partially substitute it. I want to replace every two characters of the string with #. How can I do so? Your help will be much appreciated. Thanks
code
awk ' NR==FNR {
    # First file (lookup): remember each record together with a copy in
    # which every letter and digit has been replaced by "#".
    s = $0
    gsub("[A-Za-z0-9]", "#")
    a[s] = $0
    next
}
{
    # Second file: take the text that follows the first ">" up to the next
    # "<" and, when it is a known lookup record, swap in its masked copy.
    if (match($0, ">[^<]+")) {
        str = substr($0, RSTART+1, RLENGTH-1)
        if (str in a) {
            $0 = substr($0, 1, RSTART) a[str] substr($0, RSTART+RLENGTH)
        }
    }
    lines[FNR] = $0
}
END {
    # Second pass over the buffered lines: mask any remaining whole-word
    # occurrence of a lookup record, then emit the line.
    # NOTE(review): str is used as a regular expression and "\\<" / "\\>"
    # are GNU awk word-boundary operators; lookup records that contain
    # regex metacharacters, and masked text that contains "&", are not
    # treated literally here.
    for (i = 1; i <= FNR; i++) {
        for (str in a) {
            regex = "\\<" str "\\>"
            gsub(regex, a[str], lines[i])
        }
        print lines[i]
    }
}' lookup file_1 > file_2
cat lookup
CDX98XSD
@vanti Finserv Co.
11:11 - Capital
MS&CO(NY)
MS&CO(NY)
MS&CO(NY)
cat file_1
<html>
<body>
<hr><br><>span >Records</span><table>
<tr >
<td>@vanti Finserv Co.</td>
<td>11:11 - Capital</td>
<td>MS&CO(NY)</td>
<td>New York</td>
<td>CDX98XSD</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr >
<td>@vanti Finserv Co.</td>
<td></td>
<td>MS&CO(NY)</td>
<td>2</td>
<td>2</td>
<td>MS&CO(NY)</td>
<td>MS&CO(NY)</td>
<td></td>
</table>
</body>
</html>
expected output
<html>
<body>
<hr><br><>span >Records</span><table>
<tr >
<td>@##n## F##s##v C##</td>
<td>1##11 - C##I##l</td>
<td>M##C##N##</td>
<td>New York</td>
<td>C##9##S#</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr >
<td>@##n## F##s##v C##</td>
<td></td>
<td>M##C##N##</td>
<td>2</td>
<td>2</td>
<td>M##C##N##</td>
<td>M##C##N##</td>
<td></td>
</table>
</body>
</html>
CodePudding user response:
Assumptions/Understandings:
- duplicate entries in `lookup` can be ignored (i.e., we don't treat duplicate occurrences differently)
- for each whitespace-delimited string in `lookup` we want to replace the nth/(n+1)th characters with `#` (where n = 2, 5, 8, 11, 14, 17, 20, ...)
- for the `lookup` string `11:11 - Capital` the correct replacement string is `1##1# - C##i##l` (as opposed to OP's `1##11 - C##i##l`)
One `awk` idea:
awk '
# Partially mask every lookup record found in the data file.
#
# Usage: awk ... lookup file_1 > file_2
#   lookup - one record per line; duplicates and blank lines are ignored
#   file_1 - text in which literal occurrences of lookup records are masked
#
# Masking rule: inside each run of non-blank characters keep the 1st, 4th,
# 7th, ... character and replace all the others with "#".

# Return the masked form of line. Blanks (space or tab) are copied through
# unchanged and restart the character count, so every whitespace-delimited
# string is masked independently of its neighbours and of its own content
# (a string such as "0" is handled like any other).
function mask(line,    out, n, k, ch, pos) {
    out = ""
    pos = 0
    n = length(line)
    for (k = 1; k <= n; k++) {
        ch = substr(line, k, 1)
        if (ch == " " || ch == "\t") {
            pos = 0
            out = out ch
        } else {
            pos++
            out = out ((pos % 3 == 1) ? ch : "#")
        }
    }
    return out
}

# First file: build lookups[original] = masked.
# FILENAME is compared with ARGV[1] (rather than testing FNR==NR) so that an
# empty lookup file does not cause the data file to be read as lookups.
FILENAME == ARGV[1] {
    if (NF == 0 || ($0 in lookups))     # nothing to mask, or a duplicate
        next
    lookups[$0] = mask($0)
    next
}

# Remaining input: replace each literal occurrence of a lookup record.
# The line is scanned left to right and the search resumes after each
# replacement, so the loop ends even when the masked text equals the key.
# NOTE(review): "for (key in lookups)" visits keys in an unspecified order;
# if one lookup record is a substring of another, the result may depend on
# that order.
{
    line = $0
    for (key in lookups) {
        klen = length(key)
        done = ""
        rest = line
        while ((ndx = index(rest, key)) > 0) {
            done = done substr(rest, 1, ndx-1) lookups[key]
            rest = substr(rest, ndx+klen)
        }
        line = done rest
    }
    print line
}

# uncomment following block to display contents of lookups[]
#END { print "############ lookups[]"
#      for (key in lookups)
#          print key " => " lookups[key]
#      print "############"
#    }
' lookup file_1 > file_2
This generates:
$ cat file_2
<html>
<body>
<hr><br><>span >Records</span><table>
<tr >
<td>@##n## F##s##v C##</td>
<td>1##1# - C##i##l</td>
<td>M##C##N##</td>
<td>New York</td>
<td>C##9##S#</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr >
<td>@##n## F##s##v C##</td>
<td></td>
<td>M##C##N##</td>
<td>2</td>
<td>2</td>
<td>M##C##N##</td>
<td>M##C##N##</td>
<td></td>
</table>
</body>
</html>
Focusing on just the differences:
$ diff file_1 file_2
5,7c5,7
< <td>@vanti Finserv Co.</td>
< <td>11:11 - Capital</td>
< <td>MS&CO(NY)</td>
---
> <td>@##n## F##s##v C##</td>
> <td>1##1# - C##i##l</td>
> <td>M##C##N##</td>
9c9
< <td>CDX98XSD</td>
---
> <td>C##9##S#</td>
16c16
< <td>@vanti Finserv Co.</td>
---
> <td>@##n## F##s##v C##</td>
18c18
< <td>MS&CO(NY)</td>
---
> <td>M##C##N##</td>
21,22c21,22
< <td>MS&CO(NY)</td>
< <td>MS&CO(NY)</td>
---
> <td>M##C##N##</td>
> <td>M##C##N##</td>
Uncommenting the `END{...}` block generates:
############ lookups[]
MS&CO(NY) => M##C##N##
CDX98XSD => C##9##S#
@vanti Finserv Co. => @##n## F##s##v C##
11:11 - Capital => 1##1# - C##i##l
############