regex substitution between two files with regex expression
Everywhere I look, I can only find how to substitute a single regex within one file. I am looking to read the regex in file A and substitute it with the value in file B. I tried using this code, but I am unable to get it to work.
import re
import itertools
from ast import literal_eval
# Asker's attempt (reported as not working): reads two header files and tries
# to rewrite the VERSION values of one using data taken from the other.
# NOTE(review): the body of the `with` statement has lost its indentation and
# the regex quantifiers look garbled (a space where `+` was probably intended)
# -- confirm against the original post.
# Shadows the builtin `dict`; the name is not used again in this snippet.
dict={}
# str(tuple(...)) collapses the stripped lines into ONE string (the tuple's repr).
daddy=(str(tuple(i.strip('\n') for i in open('CanIfcousin.h','r'))))
# re.split() returns a list of strings; str() followed by eval() just rebuilds
# that list. NOTE(review): eval() runs on text derived from file contents.
d = eval(str(re.split('\s ', daddy)))
# Same wrapping as above: `content` is a single string, not a sequence of lines.
content = (str(tuple(i.strip('\n') for i in open('CanIfcousin2.h','r'))))
with open('r1.h', 'w') as f:
# Iterating over the string `content` yields one character at a time, and
# '\n'.join() writes each of them on its own line -- consistent with the
# one-character-per-line output described below.
# The lambda only runs when the pattern matches; it indexes the list `d` with
# the result of x.group(...), whose argument is not a group defined in the pattern.
f.write('\n'.join(re.sub('^(#define[^\S\n] \w VERSION[^\S\n] \()[A-Z0-9] (?=\))', lambda x:d[x.group(r'\g<1> 00 ')], i) for i in content))
This is the output
T h i s
d o c u m e n t
and so on for the rest of the file: a single character on each new line, which looks like gibberish.
example
input1 = File1
##a.h
#define CANIF_AR_RELEASE_MAJOR_VERSION (1U)
#define CANIF_AR_RELEASE_MINOR_VERSION (2U)
#define CANIF_AR_RELEASE_REVISION_VERSION (3U)
#define TEST_CANTRCV_MODULE_ID (70U) /* 0x46 */
input 2 = File2
##a.h
#define CANIF_AR_RELEASE_MAJOR_VERSION (FFFU)
#define CANIF_AR_RELEASE_MINOR_VERSION (1XFFUU)
#define CANIF_AR_RELEASE_REVISION_VERSION (2X000FFFU)
#define TEST_McalModule_CanTp_ID (70U) /* 0x46 */
OUTPUT = File3
##a.h
#define CANIF_AR_RELEASE_MAJOR_VERSION (1U)
#define CANIF_AR_RELEASE_MINOR_VERSION (2U)
#define CANIF_AR_RELEASE_REVISION_VERSION (3U)
#define TEST_McalModule_CanTp_ID (70U) /* 0x46 */
`````
CodePudding user response:
Assuming I've correctly understood your requirements (for each entry in file2, get the corresponding value in file1 if possible, else put the file2 value), I've come up with this:
import re
import os
def merger(i1, i2, o, p):
    """Copy *i2* to *o*, taking ``#define`` values from *i1* where available.

    Every line of *i2* that *p* matches is looked up by macro name in *i1*.
    When *i1* defines the same macro with a parenthesised alphanumeric value,
    that value replaces the one in the *i2* line. Every other line (no match,
    or no reference value) is copied unchanged.

    Args:
        i1: Readable text file holding the reference values.
        i2: Readable text file whose lines are rewritten.
        o: Writable text file that receives the merged result.
        p: Compiled pattern applied with ``match`` to each line of *i2*;
            group 1 must capture the macro name and the whole match must end
            right before the opening parenthesis of the value.
    """
    # Read the reference file once. Sharing a single file cursor across
    # lookups would make each search start where the previous one stopped,
    # missing macros that appear earlier in the reference file.
    reference_lines = i1.readlines()
    value_in_parens = re.compile(r"\([^)]*\)")

    merged = []
    for line in i2:
        found = p.match(line)
        new_value = None
        if found is not None:
            lookup = re.compile(
                r"#define\s+" + re.escape(found.group(1))
                + r"\s*\(([A-Za-z0-9]+)\)"
            )
            for candidate in reference_lines:
                hit = lookup.match(candidate)
                if hit is not None:
                    new_value = hit.group(1)
                    break

        old_value = None
        if new_value is not None:
            # The value to replace must start exactly where the match of p ends.
            old_value = value_in_parens.match(line, found.end())

        if old_value is None:
            # No match, no reference value, or nothing to replace: keep the line.
            merged.append(line)
            continue

        # Swap only the parenthesised value so that trailing comments and the
        # line ending of the original line survive.
        merged.append(
            line[:old_value.start()] + "(" + new_value + ")"
            + line[old_value.end():]
        )

    # Lines still carry their own newlines, so no separator is added.
    o.write("".join(merged))
# Matches "#define <NAME ending in VERSION>" and everything up to, but not
# including, the opening parenthesis of the value; group 1 is the macro name.
pattern2 = re.compile(r"#define ([A-Z_]*VERSION)[^\(]*")
folder1 = "data/folder1"  # reference values
folder2 = "data/folder2"  # files to rewrite
folder3 = "data/folder3"  # merged output

# Make sure the output folder exists before any file is opened for writing.
os.makedirs(folder3, exist_ok=True)

for elt in os.listdir(folder1):
    # Context managers guarantee all three files are closed (and the output
    # flushed) even if merger() raises.
    with open(os.path.join(folder1, elt), "r") as file1, \
            open(os.path.join(folder2, elt), "r") as file2, \
            open(os.path.join(folder3, elt), "w") as file3:
        merger(file1, file2, file3, pattern2)
This should do the job, though I don't think it is the cleanest way to do it.
CodePudding user response:
Your task is complex, so you need to split it into several simpler tasks, solve them separately and then combine solutions into one. Basically we can split your task into 3 "subtasks":
- Compare two directories and find files with same filename;
- Find all matches of regular expression in first file;
- Generate search and replace regular expressions using each match from first file and apply them.
To avoid reinventing the wheel, you can use
filecmp.dircmp()
to compare two directories, and specifically its .common_files
attribute to get the list of filenames present in both folders.
from filecmp import dircmp
result = dircmp(r"some/path/dir1", r"some/path/dir2").common_files
To find matches of a particular regular expression in a file, we need to read the whole file content and use either
re.findall()
(if we want to get all matches in a list) or re.finditer()
(if we want to work with an iterator which yields Match
objects). We can also compile the pattern (using re.compile()
) and call .findall()
or .finditer()
on the returned Pattern
object. You can stay with your regular expression, but in this example I'll use
#define\s*(.+?VERSION)\s*\((.+?)\)
which returns the name of the defined constant in the first group and its value in the second.
import re
pattern = re.compile(r"#define\s*(.+?VERSION)\s*\((.+?)\)")
with open(r"some/path/dir1/a.h") as f:
    for match in pattern.finditer(f.read()):
        print(*match.groups())
From previous step we know that first group of match contains name and second contains value. Our task is to replace value in definition of constant with same name in another file, so we need to dynamically build regular expression to find constant with same name and also build replacement which will change just value inside parentheses.
If we want to use
re.sub()
we need to build a pattern which captures all text before the value we need to change in the first group and all text after the value in the second group, and build a replacement which keeps the first and second groups around the new value. Let's say that we need to replace the value of the constant CANIF_AR_RELEASE_REVISION_VERSION
with 3U
. Using the described logic, our pattern should look like this: (#define\s*CANIF_AR_RELEASE_REVISION_VERSION\s*\().+?(\).*?)
and the replacement should include the new value between the groups: \g<1>3U\g<2>
. We can declare two "format-ready" strings for the search pattern and the replacement and use
str.format()
in code to insert values from every match found in the first file. We need to read the content of the second file and update it on every loop iteration after the replacement is done. I'd also recommend escaping all special characters in the obtained matches (using re.escape()
) to prevent any possible errors.
import re
pattern = re.compile(r"#define\s*(.+?VERSION)\s*\((.+?)\)")
search = r"(#define\s*{}\s*\().+?(\).*?)"
replace = r"\g<1>{}\g<2>"
with open(r"some/path/dir1/a.h") as f1, open(r"some/path/dir2/a.h") as f2, \
        open(r"some/path/dir3/a.h", "w") as out:
    final = f2.read()
    for match in pattern.finditer(f1.read()):
        final = re.sub(
            search.format(re.escape(match.group(1))),
            replace.format(re.escape(match.group(2))),
            final
        )
    out.write(final)
Now we need to combine the three solutions above into one. For all file-related operations I'll use pathlib.Path
and its methods, but you can also use functions from os
and os.path
; see "Correspondence to tools in the os
module" in the pathlib documentation.
import re
from pathlib import Path
from filecmp import dircmp
src_dir1 = Path(r"some/path/dir1")  # files providing the new values
src_dir2 = Path(r"some/path/dir2")  # files whose values get replaced
dst_dir = Path(r"some/path/dir3")   # where the merged files are written
dst_dir.mkdir(parents=True, exist_ok=True)

# Group 1 captures the macro name (ending in VERSION), group 2 its value.
pattern = re.compile(r"#define\s*(.+?VERSION)\s*\((.+?)\)")
# Per-macro search template: group 1 is everything up to and including "(",
# group 2 is the closing ")"; the old value between them is dropped.
search = r"(#define\s*{}\s*\().+?(\).*?)"
replace = r"\g<1>{}\g<2>"

for fn in dircmp(src_dir1, src_dir2).common_files:
    with (src_dir1 / fn).open() as f1, (src_dir2 / fn).open() as f2, \
            (dst_dir / fn).open("w") as out_f:
        final = f2.read()
        for m in pattern.finditer(f1.read()):
            final = re.sub(
                # The macro name goes into a pattern, so escape it fully.
                search.format(re.escape(m.group(1))),
                # In a replacement template only backslashes are special;
                # re.escape() here would leak stray backslashes into the
                # output, so escape just the backslashes.
                replace.format(m.group(2).replace("\\", r"\\")),
                final,
            )
        out_f.write(final)
Upd. From this comment I assume that the author wants to modify the content of the files in the second folder, so a third folder isn't required at all. We just need to open the second file in r+
mode, read all of its content (using .read()
), then return the cursor to the beginning of the file (using .seek()
), write the new content and cut off any remaining data (using .truncate()
):
import re
from pathlib import Path
from filecmp import dircmp
src_dir1 = Path(r"some/path/dir1")  # files providing the new values
src_dir2 = Path(r"some/path/dir2")  # files modified in place

# Group 1 captures the macro name (ending in VERSION), group 2 its value.
pattern = re.compile(r"#define\s*(.+?VERSION)\s*\((.+?)\)")
# Per-macro search template: group 1 is everything up to and including "(",
# group 2 is the closing ")"; the old value between them is dropped.
search = r"(#define\s*{}\s*\().+?(\).*?)"
replace = r"\g<1>{}\g<2>"

for fn in dircmp(src_dir1, src_dir2).common_files:
    # "r+" opens the existing file for both reading and writing.
    with (src_dir1 / fn).open() as f1, (src_dir2 / fn).open("r+") as f2:
        final = f2.read()
        for m in pattern.finditer(f1.read()):
            final = re.sub(
                # The macro name goes into a pattern, so escape it fully.
                search.format(re.escape(m.group(1))),
                # In a replacement template only backslashes are special;
                # re.escape() here would leak stray backslashes into the
                # output, so escape just the backslashes.
                replace.format(m.group(2).replace("\\", r"\\")),
                final,
            )
        # Rewind, overwrite, then drop whatever is left of the old content
        # in case the new text is shorter.
        f2.seek(0)
        f2.write(final)
        f2.truncate()
You might also want to add some kind of backup when modifying files. The simplest way is to copy the file before modification and save the copy under the same name with an added .bak
extension using shutil.copyfile()
. Just add one line right after the start of the outer loop:
from shutil import copyfile
...
for fn in dircmp(src_dir1, src_dir2).common_files:
    # Keep a copy next to the original before the file is modified in place.
    copyfile(src_dir2 / fn, src_dir2 / (fn + ".bak"))
    ...
You can help my country, check my profile info.