Alphabetically ordering Worksheets using openpxl in WriteOnlyWorkSheet mode-CodePudding

Background:
I have created a script that takes a .xlsx file and splits the Groupings of data into separate worksheets, based on the names of each Grouping. E.g., Client A Grouped data will be written to a worksheet named Client A and so forth.

Additionally, the script strips out Excel-invalid characters, and miscellaneous strings contained within INVALID_TITLE_CHARS and INVALID_TITLE_NAMES as well as uses values in management_style and model_type to decide which rows are irrelevant and therefore skipped, during the copy process.

My issue:
I would like to expand the script so that the Worksheets are alphabetical. E.g., 'Client A, Client B, Client C, and so forth. I have tried to achieve this by client_sheet._sheets.sort(key=lambda output_workbook: output_workbook.title) however, for memory consistency reasons, I have used WriteOnlyWorksheet which seems incompatible with _sheets.

Does anyone know if there is another solution? I was hoping to avoid creating a function that reopens the.xlsx

The script:
This is the full script:

import time
import logging
import sys
import datetime as dt
import datetime
import requests as requests
import json
import enlighten
import warnings
from openpyxl import load_workbook
from openpyxl import LXML
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
from copy import copy
from configparser import ConfigParser
from requests.auth import HTTPBasicAuth

logger = logging.getLogger()
timestr = datetime.datetime.now().strftime("%Y-%m-%d")


warnings.filterwarnings("ignore")


def configure_logging():
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
INVALID_TITLE_NAMES = ["zz_ FeeRelationship", "Family"]


def clean_sheet_title(title):
    title = title or ""
    title = title.strip()
    title = title.translate(INVALID_TITLE_CHAR_MAP)
    for name in INVALID_TITLE_NAMES:
        title = title.replace(name, "")
    return title[:31]


def is_client_row(row, row_dimension):
    return row_dimension.outlineLevel == 0


def create_write_only_cell(source_cell, target_sheet):
    target_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
    target_cell.data_type = source_cell.data_type

    if source_cell.has_style:
        target_cell.font = copy(source_cell.font)
        # target_cell.border = copy(source_cell.border)
        # target_cell.fill = copy(source_cell.fill)
        target_cell.number_format = copy(source_cell.number_format)
        # target_cell.protection = copy(source_cell.protection)
        target_cell.alignment = copy(source_cell.alignment)

    return target_cell


def create_write_only_row(source_row, target_sheet):
    return [create_write_only_cell(cell, target_sheet) for cell in source_row]


def skip_row(row, row_dimension):
    """
    Determine whether a row needs to be skipped and not copied to new workbook
    """

    def get_column_value(column):
        value = row[column].value or ""
        return value.strip()

    # skip total line
    if row[0].value == "Total":
        return True

    management_style = [
        "Advisory",
        "Advisory - No Fee",
        "Holding",
        "JPAS",
        "Liquidity Management",
        "Trading",
        "",
    ]
    model_type = ["Client", "Holding Account", "Holding Company", "Trust", ""]

    management_value = get_column_value(3)
    model_value = get_column_value(11)

    # Pass on either column
    return management_value not in management_style and model_value not in model_type

    # # Pass on both columns
    # return management_value not in management_style or model_value not in model_type


def split_workbook(input_file, output_file):
    """
    Split workbook each client into its own sheet.
    """
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        skip_child = False
        skipped_parent_outline_level = 0
        skipped_rows_per_client = 0

        rows = data_sheet.rows
        header = next(rows)
        for index, row in enumerate(rows, start=2):
            row_dimension = data_sheet.row_dimensions[index]

            # verify whether current row is a child of skipped parent
            if skip_child and skipped_parent_outline_level < row_dimension.outlineLevel:
                skipped_rows_per_client  = 1
                continue

            # reset skip_child when current row is not a child of skipped parent anymore
            if (
                skip_child
                and skipped_parent_outline_level >= row_dimension.outlineLevel
            ):
                skip_child = False

            # check whether row needs to be skipped
            if skip_row(row, row_dimension):
                skipped_rows_per_client  = 1
                skip_child = True
                skipped_parent_outline_level = row_dimension.outlineLevel
                continue

            # create new sheet found new client is found
            if is_client_row(row, row_dimension):
                skipped_rows_per_client = 0
                processing_client  = 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index

                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet

                client_sheet.append(create_write_only_row(header, client_sheet))

            # copy row dimensions
            new_row_index = index - skipped_rows_per_client - client_row_index   2
            client_sheet.row_dimensions[new_row_index] = copy(row_dimension)
            client_sheet.row_dimensions[new_row_index].worksheet = client_sheet

            # finally copy row
            client_sheet.append(create_write_only_row(row, client_sheet))

            if index % 10000 == 0:
                logger.info(f"{index} rows processed")

        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook:
            workbook.close()
        if output_workbook:
            output_workbook.close()


if __name__ == "__main__":
    start = time.time()
    configure_logging()
    input_file = 'Input_File' timestr '.xlsx'
    output_file = 'Output_File' timestr '.xlsx'
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    logger.info("Time consumed: % s seconds" % (time.time() - start))

CodePudding user response：

I am able to achieve this using _sheets.sort() the titles. Here is an example to demonstrate the same...

import openpyxl
from openpyxl.cell import WriteOnlyCell

#Create workbook in write_only mode as you did
output_workbook = Workbook(write_only=True)

#Add 4 sheets 
ws1 = output_workbook.create_sheet("clientB_sheet_title")
ws2 = output_workbook.create_sheet("clientA_sheet_title")
ws3 = output_workbook.create_sheet("clientD_sheet_title")
ws4 = output_workbook.create_sheet("clientC_sheet_title")

#Add some data at start
cell = WriteOnlyCell(ws1, value="hello clientB")
ws1.append([cell, 3.14, None])

cell = WriteOnlyCell(ws2, value="hello clientA")
ws2.append([cell, 3.14, None])

cell = WriteOnlyCell(ws3, value="hello clientD")
ws3.append([cell, 3.14, None])

cell = WriteOnlyCell(ws4, value="hello clientC")
ws4.append([cell, 3.14, None])

### The key to your question - The sorting of titles ###
output_workbook._sheets.sort(key=lambda ws: ws.title)

#Finally save
output_workbook.save("output_file.xlsx")

Output excel

CodePudding user response：

Based on Redox's answer, I was able to throw together a super-simple function to achieve this -

def alphabetical_client_sheet_sort(output_file):
    workbook = load_workbook(output_file)
    workbook._sheets.sort(key=lambda output_file: output_file.title)
    workbook.save(output_file)
    workbook.close()