I have a folder containing many Word documents, some ending in .doc and some in .docx.
The code below works only for .docx files; I want it to handle .doc files as well.
# Count the characters in every .docx file in the current directory and write
# the per-file counts (divided by 65) plus a grand total to charCounts.xlsx.
import os

# uses python-docx package
import docx
# uses openpyxl package
from openpyxl import Workbook

# Maps file name -> (total paragraph characters) / 65.
charCounts = {}
for filename in os.listdir('.'):
    if filename.endswith(".docx"):
        doc = docx.Document(filename)
        # Only body paragraphs are counted here.
        chars = sum(len(p.text) for p in doc.paragraphs)
        charCounts[filename] = chars / 65

wb = Workbook()
ws = wb.active
# Header row; data starts on row 3, leaving row 2 blank.
ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')
for i, x in enumerate(charCounts):
    ws.cell(row=i + 3, column=2, value=x)
    ws.cell(row=i + 3, column=4, value=charCounts[x])
# Grand total goes on the row immediately after the last data row.
ws.cell(row=len(charCounts) + 3, column=4, value=sum(charCounts.values()))
path = './charCounts.xlsx'
wb.save(path)
Images:
I want the output to look like this:
Notice two things here.
First, the file names in the Excel sheet are arranged in numerical order.
Second, the file extensions have been removed from the names in the Excel sheet. I want the output to look like that.
CodePudding user response:
Here is an update to the code in your question which will do what I believe you have asked:
# Count the characters in every .doc and .docx file in a directory and write
# the per-file counts (divided by 65) plus a grand total to charCounts.xlsx.
# .doc files are first converted to .docx through Word (Windows only).
import os
import re

# uses python-docx package
import docx
# uses pywin32 package
import win32com.client as win32
from win32com.client import constants
# uses openpyxl package
from openpyxl import Workbook


def _natural_key(name):
    """Return a sort key that orders embedded numbers numerically.

    Splitting on a captured digit run always yields alternating text/number
    chunks starting with text, so odd positions are the numeric ones and
    keys from different names compare position-by-position without mixing
    str and int.
    """
    parts = re.split(r'(\d+)', name)
    return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]


# NOTE(review): the Word instance obtained here is never quit by this script;
# confirm whether app.Quit() is safe to add for your environment.
app = win32.gencache.EnsureDispatch('Word.Application')

# Maps original file name -> (total paragraph characters) / 65.
charCounts = {}
fileDir = '.'  # Put the path of the directory to be searched here
os.chdir(fileDir)
cwd = os.getcwd()
directory = os.fsencode(cwd)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # Skip conversion outputs left behind by this script.
    if filename.startswith('TEMP_CONVERTED_WORD_FILE_'):
        continue
    filenameOrig = None
    if filename.endswith(".doc"):
        # python-docx cannot read .doc, so have Word save a .docx copy.
        filenameOrig = filename
        src_path = os.path.join(cwd, filename)
        src_path_norm = os.path.normpath(src_path)
        doc = app.Documents.Open(src_path_norm)
        doc.Activate()
        docxPath = 'TEMP_CONVERTED_WORD_FILE_' + filename[:-4] + ".docx"
        dest_path = os.path.join(cwd, docxPath)
        dest_path_norm = os.path.normpath(dest_path)
        app.ActiveDocument.SaveAs(dest_path_norm, FileFormat=constants.wdFormatXMLDocument)
        doc.Close(False)
        # Fall through to the .docx branch using the converted copy.
        filename = docxPath
    if filename.endswith(".docx"):
        src_path = os.path.join(cwd, filename)
        src_path_norm = os.path.normpath(src_path)
        doc = docx.Document(src_path_norm)
        # Body paragraphs plus the header and footer paragraphs of each section.
        chars = sum(len(p.text) for p in doc.paragraphs) + sum(
            len(p.text)
            for section in doc.sections
            for hf in [section.header, section.footer]
            for p in hf.paragraphs
        )
        # Key by the original name so converted .doc files are reported as such.
        charCounts[filenameOrig if filenameOrig else filename] = chars / 65

# Rebuild the dict so iteration order is number-wise by file name
# (e.g. "2.docx" before "10.docx"), as requested in the question.
charCounts = {k: charCounts[k] for k in sorted(charCounts, key=_natural_key)}

wb = Workbook()
ws = wb.active
# Header row; data starts on row 3, leaving row 2 blank.
ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')
for i, x in enumerate(charCounts):
    # Strip the ".doc" / ".docx" extension from the displayed name.
    ws.cell(row=i + 3, column=2, value=x[:-4] if x.endswith('.doc') else x[:-5])
    ws.cell(row=i + 3, column=4, value=charCounts[x])
# Grand total goes on the row immediately after the last data row.
ws.cell(row=len(charCounts) + 3, column=3, value='Total')
ws.cell(row=len(charCounts) + 3, column=4, value=sum(charCounts.values()))
path = './charCounts.xlsx'
wb.save(path)
Explanation:
- For every file whose name ends in `.docx`, except those starting with `TEMP_CONVERTED_WORD_FILE_`, store the character count (divided by 65) in the dictionary `charCounts`, keyed by filename.
- For every file ending in `.doc`, use the `pywin32` package of Win32 extensions to convert it to a `.docx` file with `TEMP_CONVERTED_WORD_FILE_` prepended to the filename, then store the character count (divided by 65) in the same dictionary, keyed by the original filename.
- Replace the `charCounts` dictionary with one whose insertion order is sorted by the filename key.
- Iterate through `charCounts`, storing the contents in an Excel file, taking care to strip the `.doc` or `.docx` suffix from the filename key.