I have this macro that compare each word with other and check if that is the duplicate if yes then delete it, however it works really great for 1 to 4 pages with a time of at-most 5minutes. But for the document of 50 or 100 pages it took me a decade to run it.
I am in need of modification Or a new idea to compare and delete duplicates with more efficient code and less time. How should I do?
Sub Delete_Duplicates()
'***********'
'By
'MBA
'***********'
Dim AD As Range
Dim F As Range
Dim i As Long
Set AD = ActiveDocument.Range
Z = AD.Words.Count
y = 1
For i = Z To 1 Step -1
y = y 1
Set F = AD.Words(i)
On Error Resume Next
Set s = AD.Words(i - 1)
If Trim(AD.Words(i - 1)) = "," Then Set s = AD.Words(i - 2): Set c = AD.Words(i - 1)
If Err.Number > 0 Then Exit Sub
If Not F.Text = Chr(13) And UCase(Trim(F.Text)) = UCase(Trim(s.Text)) Then
F.Text = ""
If Not c Is Nothing Then c.Text = " ": Set c = Nothing
End If
If Not c Is Nothing Then Set c = Nothing
On Error Resume Next
Call ProgressBar.Progress(y / Z * 100, True) '<<-- Progress Bar
On Error GoTo 0
Next
Beep
End Sub
CodePudding user response:
It is only conception but try to prepare list of all words in document and replace double or triple words if existing.
Private Sub DeleteDuplicate()
Dim wholeTxt As String
Dim w As Range
Dim col As New Collection
Dim c
For Each w In ActiveDocument.Words
AddUniqueItem col, Trim(w.Text)
Next w
wholeTxt = ActiveDocument.Range.Text
For Each c In col
'add case with ","
'maybe one letter word should be forbidden, or add extra boundary
If InStr(1, wholeTxt, c & " " & c, vbBinaryCompare) <> 0 Then
'start of doc
Selection.HomeKey wdStory
'here should be all stuff to prepare replacement
'(...)
Selection.Find.Execute Findtext:=c & " " & c, ReplaceWith:=c
wholeTxt = ActiveDocument.Range.Text
End If
Next c
Set col = Nothing
End Sub
Private Sub AddUniqueItem(ByRef col As Collection, ByVal itemValAndKey As String)
Dim s As String
On Error Resume Next
s = col(itemValAndKey)
If Err.Number <> 0 Then
col.Add itemValAndKey, itemValAndKey
Err.Clear
End If
On Error GoTo 0
End Sub
CodePudding user response:
Assuming that the entire document is plain text, we can assign the entire document's text and use Split
to convert it into array of words.
Since it's in array, it will be faster to process through them all vs accessing the Words
collection.
This is all I can think of but perhaps there's a better way to do this? Below example uses Regex to search through and replace all matched duplicate:
Option Explicit
Sub Delete_Duplicate()
Const maxWord As Long = 2 'Change this to increase the max amount of words should be used to match as a phrase.
Dim fullTxt As String
fullTxt = ActiveDocument.Range.Text
Dim txtArr() As String
txtArr = Split(fullTxt, " ")
Dim regex As RegExp
Set regex = New RegExp
regex.Global = True
regex.IgnoreCase = True
Dim outputTxt As String
outputTxt = fullTxt
Dim n As Long
Dim i As Long
For i = UBound(txtArr) To 0 Step -1
Dim matchWord As String
matchWord = vbNullString
For n = 0 To maxWord - 1
If (i - n) < 0 Then Exit For
matchWord = txtArr(i - n) & " " & matchWord
matchWord = Trim$(Replace(matchWord, vbCr, vbNullString))
regex.Pattern = matchWord & "[, ]{0,}" & matchWord
If regex.test(outputTxt) Then
outputTxt = regex.Replace(outputTxt, matchWord)
End If
Next n
Next i
Set regex = Nothing
Application.UndoRecord.StartCustomRecord "Delete Duplicates"
ActiveDocument.Range.Text = outputTxt
Application.UndoRecord.EndCustomRecord
End Sub