Splitting Excel Data by Groupings into Separate Workbook Sheets

Question

Background:
I have a large 40MB XLSX file that contains client data which is Grouped over multiple levels, like so:

Expanded -

Not Expanded (sorry about the terrible dummy data!) -

Objective:
I would like to split Client A, B C etc... and all their respective underlying data into separate sheets (named 'Client A' etc...) in a Workbook.

Question:
Am I correct in assuming that there is no python library that would help with this (e.g., xlsxwriter) and that I will likely have to save into multiple pandas df before splitting and writing to the xlsx file?

Sample Data:
Here is a link to some randomized sample data. In this file you will see only 1 client (the total row can be ignored) however imagine the normal file having 40 clients / groupings and sub levels.

Sample Code: this function takes the '.xlsxand writes each grouping to an appropriately named tab (e.g., 'Client A') to a separate Worksheet in a new.xlsx`. The issue with this code is that because I am basically going through and copying each cell individually, I didn't think to consider more holistically however to ensure the Groupings/Levels would be preserved. I think this code needs a complete re-write, and welcome feedback

import openpyxl

from copy import copy
from openpyxl import load_workbook

columns=['A','B','C','D','E','F','G','H','I','J','K','L']
def copy_cell(ws, row,ws_row,ws1):
    for col in columns:
        ws_cell=ws1[col+str(ws_row)]
        new_cell = ws[col+str(row)]

        if ws_cell.has_style:
            new_cell.font = copy(ws_cell.font)
            new_cell.border = copy(ws_cell.border)
            new_cell.fill = copy(ws_cell.fill)
            new_cell.number_format = copy(ws_cell.number_format)
            new_cell.protection = copy(ws_cell.protection)
            new_cell.alignment = copy(ws_cell.alignment)

wb1 = openpyxl.load_workbook('annonamized_test_data_to_be_split.xlsx')

ws1=wb1.active

indexs=[]
clients=[]

index=1

while ws1['A'+str(index)]:
    if str(ws1['A'+str(index)].alignment.indent)=='0.0':
        indexs.append(index)
        clients.append(ws1['A'+str(index)].value)
    if ws1['A'+str(index)].value  is None:
        indexs.append(index)
        break
    index+=1

wb1.close()

wb = openpyxl.Workbook()

ws=wb.active

start_index=1

headers=['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']

for y,index in enumerate(indexs):
    try:
        client=0
        if len(clients[y])>=32:
            client=clients[y][:31]
        else:
            client=clients[y]
        wb.create_sheet(client)
        ws=wb[client]
        ws.column_dimensions['A'].width=35
        ws.append(headers)
        row_index=2
        for i in range(start_index,indexs[y+1]):
            ws.append([ws1[col+str(i)].value for col in columns])
            copy_cell(ws,row_index,i,ws1)
            row_index+=1
        start_index=indexs[y+1]
    except:
        pass

wb.save('split_data.xlsx')
wb.close()

try:
    wb1 = openpyxl.load_workbook('split_data.xlsx')
    a=wb1['Sheet']
    wb1.remove(a)
    a=wb1['Sheet1']
    wb1.remove(a)
    wb1.save('split_data.xlsx')
    wb1.close()
except:
    pass

Please can someone point me in the right direction of a resource that might teach me how to achieve this?

can you post the excel here? It can be done easier if the group names are available in separate columns in a flat format — lytseeker, May 02 '22 at 04:47
@lytseeker - sure thing. I have put a link to an example `.xlsx` which only has 1 fictitious client in it. Imagine if there were 40. Extracting each of these (e.g., Falconer Family) to individual sheets is the objective. — William, May 04 '22 at 05:05
Can I provide any further information @lytseeker? I was able to create a script, but I just can't seem to preserve the hierarchy, when I split the data. — William, May 11 '22 at 22:08
Please share the sample file with the code you have written so far, and point out how your expected results differ from results you get. — lytseeker, May 12 '22 at 10:59
@lytseeker - thanks for the feedback. I have added the function that allows my output to be reproducible by you. — William, May 12 '22 at 14:38

score 1 · Accepted Answer · answered May 14 '22 at 01:12

from openpyxl import load_workbook


def get_client_rows(sheet):
    """Get client rows.

    Skip header and then look for row dimensions without outline level
    """
    return [row[0].row for row in sheet.iter_rows(2) if row[0].alignment.indent == 0.0]
    return [
        row_index
        for row_index, row_dimension in sheet.row_dimensions.items()
        if row_index > 1 and row_dimension.outline_level == 0
    ]


def delete_client_block(sheet, start, end):
    """
    Delete rows starting from up to and including end.
    """
    for row in range(start, end + 1):
        sheet.row_dimensions.pop(row, None)
    sheet.delete_rows(start, end - start + 1)


def split_workbook(input_file, output_file):
    """
    Split workbook each main group into its own sheet.

    Not too loose any formatting we copy the current sheet and remove all rows
    which do not belong to extacted group.
    """

    try:
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)

        for index, client_row in enumerate(client_rows):
            # create new sheet for given client, shorten client as it might be too long
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:32]

            # delete rows after current client if available
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )

            # delete rows before current client if available
            if index > 0:
                first_client_row = client_rows[0]
                delete_client_block(
                    client_sheet, first_client_row, client_row - first_client_row + 1
                )

                # move left over dimensions to top of the sheet
                for row_index in list(client_sheet.row_dimensions.keys()):
                    # skip header row dimension
                    if row_index > first_client_row - 1:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - client_row + first_client_row
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension

        del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        workbook.close()


if __name__ == "__main__":
    # input_file = "annonamized_test_data_to_be_split.xlsx"
    input_file = 'partial_Q1_Client_Billing_Data.xlsx'
    # output_file = "split_data.xlsx"
    output_file = "splitting_full_data.xlsx"

    split_workbook(input_file, output_file)

Splitting Excel Data by Groupings into Separate Workbook Sheets

1 Answers1