Let's say that we've read a Python file with multiple lines of comments and then some code. This is stored in `data` as a list or `np.ndarray`.
data = ["# this", "# is" "# the first comment", "print('hello world')", "# second comment"]
expected_output = ["```this is the first comment```", "print('hello world')", "``` second comment```"]
expected_output
The desired output will replace the multiple elements starting with a #
character with the single parsed comment wrapped in the backtick
characters
['```this is the first comment```',
"print('hello world')",
'``` second comment```']
I can do the parsing but I don't know how to replace the individual lines with the newly formatted single lines (e.g. index [0, 1, 2]
in the example above).
The script so far:
from pathlib import Path
import numpy as np
from itertools import groupby
from operator import itemgetter
def get_consecutive_group_edges(data: np.ndarray):
    """Map each run of consecutive integers in *data* to its (first, last) pair.

    Returns a dict keyed by group index (0, 1, 2, ...) whose values are
    ``(start, end)`` tuples, both inclusive.
    """
    # https://stackoverflow.com/a/2154437/9940782
    # Consecutive values share the same index-minus-value key, so groupby
    # splits the sequence exactly at the gaps.
    edges = []
    for _, run in groupby(enumerate(data), lambda pair: pair[0] - pair[1]):
        members = [int(value) for _, value in run]
        edges.append((members[0], members[-1]))
    # convert ranges into group index
    # https://stackoverflow.com/a/952952/9940782
    return dict(enumerate(edges))
if __name__ == "__main__":
    # https://stackoverflow.com/a/17141572/9940782
    filedata = ["# this", "# is" "# the first comment", "print('hello world')", "# second comment"]
    # find all consecutive lines starting as comments
    comment_lines = np.argwhere([l[0] == "#" for l in filedata])
    group_lookup = get_consecutive_group_edges(comment_lines)
    output_lines = []
    for comment_idx in group_lookup.keys():
        # extract the comment groups (end index is inclusive, so add 1 for slicing)
        min_comment_line = group_lookup[comment_idx][0]
        max_comment_line = group_lookup[comment_idx][1] + 1
        data = filedata[min_comment_line: max_comment_line]
        # remove the comment characters
        output = "".join(data).replace("\n", " ").replace("#", "")
        # wrap in ```
        output = "```" + output + "```" + "\n"
I am failing at the final step: how do I replace all of the values between `min_comment_line` and `max_comment_line` for each group with the single, newly parsed output?
Can I do something with the non-commented lines?
non_comment_lines = np.argwhere([l[0] != "#" for l in filedata])
CodePudding user response:
You can assign to a list slice in Python, which can replace multiple elements with one:
...
# make a copy of the original list, so we can replace the comments
output_lines = filedata.copy()
# iterate backwards so the indices line up
for comment_idx in reversed(group_lookup):
    # extract the comment groups (end index is inclusive, so add 1 for slicing)
    min_comment_line = group_lookup[comment_idx][0]
    max_comment_line = group_lookup[comment_idx][1] + 1
    data = filedata[min_comment_line:max_comment_line]
    # remove the comment characters
    output = "".join(data).replace("\n", " ").replace("#", "")
    # wrap in ```
    output = "```" + output + "```"
    # slice assignment replaces the whole comment run with the single parsed line
    output_lines[min_comment_line:max_comment_line] = [output]
However, the entire operation can be much simpler, since groupby
only groups consecutive matching elements:
output_lines = []
# iterate over consecutive sections of comments and code
for is_comment, lines in groupby(filedata, key=lambda x: x[0] == "#"):
    if is_comment:
        # remove the comment characters
        output = "".join(lines).replace("\n", " ").replace("#", "")
        # wrap in ```
        output_lines.append("```" + output + "```")
    else:
        # leave code lines unchanged
        output_lines.extend(lines)