How to get no. of characters of files ending with .docx and .doc from a directory and divide each fi-CodePudding

I have a folder of many word document files ending with .doc and .docx.

This code works only for .docx files; I want it to handle .doc files as well.

import docx
import os

# Character count (divided by 65) for every .docx file in the current
# directory, keyed by file name.
charCounts = {}
directory = os.fsencode('.')
for entry in os.listdir(directory):
    filename = os.fsdecode(entry)
    if not filename.endswith(".docx"):
        # Anything that is not a .docx file (including .doc) is skipped.
        continue
    doc = docx.Document(filename)
    # Add up the text length of every paragraph exposed by doc.paragraphs.
    chars = 0
    for paragraph in doc.paragraphs:
        chars += len(paragraph.text)
    charCounts[filename] = chars / 65

# uses openpyxl package
from openpyxl import Workbook

# Write the per-file results to a new workbook: file names in column B and
# chars/65 in column D. Row 1 holds the headers and data starts at row 3.
wb = Workbook()
ws = wb.active

ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')
for i, x in enumerate(charCounts):
    ws.cell(row=i + 3, column=2, value=x)
    ws.cell(row=i + 3, column=4, value=charCounts[x])
# The grand total goes in the row right after the last file row. It is written
# once, after the loop, instead of being rewritten on every iteration.
ws.cell(row=len(charCounts) + 3, column=4, value=sum(charCounts.values()))
path = './charCounts.xlsx'
wb.save(path)

Images:-

I have files like these.

I want them to happen like these:

Notice two things here.

First, the file names in the Excel sheet are arranged in numeric order.

Second, the file extensions have been removed from the names in the Excel sheet. I want the output to look like that.

CodePudding user response：

Here is an update to the code in your question which will do what I believe you have asked:

# uses python-docx package
import docx
import os

# uses pywin32 package
import win32com.client as win32
from win32com.client import constants
import re

# Word application COM object, used to convert .doc files to .docx.
app = win32.gencache.EnsureDispatch('Word.Application')

# Prefix given to the converted copies of .doc files. Files carrying it are
# skipped when scanning, so copies left by an earlier run are not counted.
tempPrefix = 'TEMP_CONVERTED_WORD_FILE_'


def _naturalKey(name):
    """Return a sort key that orders digit runs in *name* by numeric value.

    re.split with a capturing group alternates text and digit tokens, so the
    odd positions are always digit runs. Converting those to int makes
    "2.docx" sort before "10.docx".
    """
    tokens = re.split(r'(\d+)', name)
    return [int(tok) if i % 2 else tok.lower() for i, tok in enumerate(tokens)]


charCounts = {}
fileDir = '.' # Put the path of the directory to be searched here
os.chdir(fileDir)
cwd = os.getcwd()
directory = os.fsencode(cwd)
try:
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.startswith(tempPrefix):
            continue
        filenameOrig = None
        if filename.endswith(".doc"):
            # Have Word save a .docx copy of the .doc file, then carry on with
            # that copy while remembering the original name for the result.
            filenameOrig = filename
            src_path = os.path.join(cwd, filename)
            src_path_norm = os.path.normpath(src_path)
            doc = app.Documents.Open(src_path_norm)
            doc.Activate()
            docxPath = tempPrefix + filename[:-4] + ".docx"
            dest_path = os.path.join(cwd, docxPath)
            dest_path_norm = os.path.normpath(dest_path)
            app.ActiveDocument.SaveAs(dest_path_norm, FileFormat=constants.wdFormatXMLDocument)
            doc.Close(False)
            filename = docxPath
        if filename.endswith(".docx"):
            src_path = os.path.join(cwd, filename)
            src_path_norm = os.path.normpath(src_path)
            doc = docx.Document(src_path_norm)
            # Body paragraphs plus the paragraphs of every section's header
            # and footer.
            bodyChars = sum(len(p.text) for p in doc.paragraphs)
            headerFooterChars = sum(
                len(p.text)
                for section in doc.sections
                for hf in [section.header, section.footer]
                for p in hf.paragraphs
            )
            chars = bodyChars + headerFooterChars
            charCounts[filenameOrig if filenameOrig else filename] = chars / 65
finally:
    # Shut Word down even if a conversion fails, so no Word process is left
    # running in the background.
    app.Quit(SaveChanges=constants.wdDoNotSaveChanges)

# Rebuild the dictionary so its insertion order follows the file names in
# natural (number-aware) order.
charCounts = {k: charCounts[k] for k in sorted(charCounts, key=_naturalKey)}

# uses openpyxl package
from openpyxl import Workbook

# Write the results to a new workbook: file names (without extension) in
# column B and chars/65 in column D. Row 1 holds the headers and data starts
# at row 3.
wb = Workbook()
ws = wb.active

ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')
for i, x in enumerate(charCounts):
    # Every key ends in .doc or .docx; drop that final extension for display.
    ws.cell(row=i + 3, column=2, value=x.rsplit('.', 1)[0])
    ws.cell(row=i + 3, column=4, value=charCounts[x])
# The totals row sits directly below the last file row.
totalRow = len(charCounts) + 3
ws.cell(row=totalRow, column=3, value='Total')
ws.cell(row=totalRow, column=4, value=sum(charCounts.values()))
path = './charCounts.xlsx'
wb.save(path)

Explanation:

For every file whose name ends in .docx, except those starting with TEMP_CONVERTED_WORD_FILE_, store the character count (divided by 65) in the dictionary charCounts, using the filename as the key.
For every file ending in .doc, use the pywin32 package of Win32 extensions to convert it to a .docx file with TEMP_CONVERTED_WORD_FILE_ prepended to the filename, then store character count (divided by 65) by its original filename as key in the same dictionary as above
Replace the charCounts dictionary with a new one whose entries are inserted in sorted order of the filename key, so that iterating over it yields the files in that order.
Iterate through charCounts storing the contents in an Excel file, taking care to truncate the .doc or .docx suffix from the filename key.