Let's say that we've read a Python file with multiple lines of comments and then some code. This is stored in `data` as a list or `np.ndarray`.
data = ["# this", "# is" "# the first comment", "print('hello world')", "# second comment"]
expected_output = ["```this is the first comment```", "print('hello world')", "``` second comment```"]
expected_output
The desired output will replace the multiple elements starting with a #
character with the single parsed comment wrapped in the backtick
characters
['```this is the first comment```',
"print('hello world')",
'``` second comment```']
I can do the parsing but I don't know how to replace the individual lines with the newly formatted single lines (e.g. index [0, 1, 2]
in the example above).
The script so far:
from pathlib import Path
import numpy as np
from itertools import groupby
from operator import itemgetter
def get_consecutive_group_edges(data: np.ndarray):
    """Map each run of consecutive integers in *data* to its (first, last) pair.

    Returns a dict keyed by group index (0, 1, 2, ...) whose values are
    ``(start, end)`` tuples, both inclusive.
    """
    # https://stackoverflow.com/a/2154437/9940782
    # Consecutive values share the same index-minus-value key, so groupby
    # splits the sequence exactly at the gaps.
    edges = []
    for _, run in groupby(enumerate(data), lambda pair: pair[0] - pair[1]):
        members = [int(value) for _, value in run]
        edges.append((members[0], members[-1]))
    # convert ranges into group index
    # https://stackoverflow.com/a/952952/9940782
    return dict(enumerate(edges))
if __name__ == "__main__":
    # https://stackoverflow.com/a/17141572/9940782
    filedata = ["# this", "# is" "# the first comment", "print('hello world')", "# second comment"]
    # find all consecutive lines starting as comments
    comment_lines = np.argwhere([l[0] == "#" for l in filedata])
    group_lookup = get_consecutive_group_edges(comment_lines)
    output_lines = []
    for comment_idx in group_lookup.keys():
        # extract the comment groups (end index is inclusive, so add 1 for slicing)
        min_comment_line = group_lookup[comment_idx][0]
        max_comment_line = group_lookup[comment_idx][1] + 1
        data = filedata[min_comment_line: max_comment_line]
        # remove the comment characters
        output = "".join(data).replace("\n", " ").replace("#", "")
        # wrap in ```
        output = "```" + output + "```" + "\n"
I am failing at the final step: how do I replace all of the values between `min_comment_line` and `max_comment_line` for each group with the single, newly parsed output?
Can I do something with the non-commented lines?
non_comment_lines = np.argwhere([l[0] != "#" for l in filedata])
CodePudding user response:
You can assign to a list slice in Python, which can replace multiple elements with one:
...
# make a copy of the original list, so we can replace the comments
output_lines = filedata.copy()
# iterate backwards so the indices line up
for comment_idx in reversed(group_lookup):
    # extract the comment groups (end index is inclusive, so add 1 for slicing)
    min_comment_line = group_lookup[comment_idx][0]
    max_comment_line = group_lookup[comment_idx][1] + 1
    data = filedata[min_comment_line:max_comment_line]
    # remove the comment characters
    output = "".join(data).replace("\n", " ").replace("#", "")
    # wrap in ```
    output = "```" + output + "```"
    # slice assignment replaces the whole comment run with the single parsed line
    output_lines[min_comment_line:max_comment_line] = [output]
However, the entire operation can be much simpler, since groupby
only groups consecutive matching elements:
output_lines = []
# iterate over consecutive sections of comments and code
for is_comment, lines in groupby(filedata, key=lambda x: x[0] == "#"):
    if is_comment:
        # remove the comment characters
        output = "".join(lines).replace("\n", " ").replace("#", "")
        # wrap in ```
        output_lines.append("```" + output + "```")
    else:
        # leave code lines unchanged
        output_lines.extend(lines)