I have a list of thousands of chemical formulae in a spreadsheet, and I want to count the number of times each chemical element appears in each chemical formula. Some examples are given here:
- CH3NO3
- CSe2
- C2Cl2
- C2Cl2O2
- C4H6COOH
- (C6H5)2P(CH2)6P(C6H5)2
I have found some code by @PEH (
Private RegEx As RegExp
Function CountElements(ChemFormulaRange As Variant, ElementRange As Variant) As Variant
'define variables
Dim RetValRange() As Long
Dim RetVal As Long
Dim ChemFormula As String
Dim npoints As Long
Dim i As Long
Dim mpoints As Long
Dim j As Long
' Connvert input ranges to variant arrays
If TypeName(ChemFormulaRange) = "Range" Then ChemFormulaRange = ChemFormulaRange.Value
If TypeName(ElementRange) = "Range" Then ElementRange = ElementRange.Value
'parameter
npoints = UBound(ChemFormulaRange, 1) - LBound(ChemFormulaRange, 1) 1
mpoints = UBound(ElementRange, 2) - LBound(ElementRange, 2) 1
'dimension array
ReDim RetValRange(1 To npoints, 1 To mpoints)
If RegEx Is Nothing Then
Set RegEx = New RegExp
' apply the properties
End If
'calculate all values
For j = 1 To mpoints
Element = ElementRange(1, j)
For i = 1 To npoints
RetVal = 0
ChemFormula = ChemFormulaRange(i, 1)
Call ChemRegex(ChemFormula, Element, RetVal, RegEx)
RetValRange(i, j) = RetVal
Next i
Next j
'output answer
CountElements = RetValRange
End Function
Private Sub ChemRegex(ChemFormula, Element, RetVal, RegEx)
'ChemRegex created by PEH (CC BY-SA 4.0) https://stackoverflow.com/a/46091904/17194644
With RegEx
.Global = True
.MultiLine = True
.IgnoreCase = False
End With
'first pattern matches every element once
RegEx.Pattern = "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
Dim Matches As MatchCollection
Set Matches = RegEx.Execute(ChemFormula)
Dim m As Match
For Each m In Matches
If m.SubMatches(0) = Element Then
RetVal = RetVal IIf(Not m.SubMatches(1) = vbNullString, m.SubMatches(1), 1)
End If
Next m
End Sub
CodePudding user response:
You might get a performance improvement from extracting all the elements in one regex execution rather that for only one element at a time.
Option Explicit
Sub Demo()
Dim lastrow As Long, lastcol As Long
Dim c As Long, r As Long, d As Object
Dim f As String, el As String, ar
Set d = CreateObject("Scripting.Dictionary")
With Sheet1
lastrow = .Cells(.Rows.Count, "A").End(xlUp).Row
lastcol = .Cells(1, .Columns.Count).End(xlToLeft).Column
ar = .Cells(1, 1).Resize(lastrow, lastcol)
For r = 2 To lastrow
f = ar(r, 1)
Call parse(d, f)
For c = 2 To lastcol
el = ar(1, c)
If d.exists(el) Then
ar(r, c) = d(el)
End If
Next
d.RemoveAll
Next
.Cells(1, 1).Resize(lastrow, lastcol) = ar
End With
MsgBox "Done"
End Sub
Sub parse(ByRef dict, s As String)
Dim regex As Object
Set regex = CreateObject("VBScript.RegExp")
With regex
.Global = False
.Pattern = "[(]([^)] )[)](\d )"
End With
' expand bracket into multiple entries
Dim m, matches, sm, n As Long, el As String
Do While regex.test(s)
Set m = regex.Execute(s)
For n = 1 To m(0).submatches(1)
s = s & " " & m(0).submatches(0)
Next
s = regex.Replace(s, "")
Loop
' count elements
regex.Pattern = "(" & Symbols & ")([0-9]*)"
regex.Global = True
If regex.test(s) Then
Set matches = regex.Execute(s)
For Each m In matches
el = m.submatches(0)
n = Val(m.submatches(1))
If n = 0 Then n = 1
dict(el) = dict(el) n
Next
End If
End Sub
Function Symbols() As String
Symbols = "A[cglmrstu]|" & _
"B[aehikr]?|" & _
"C[adeflmnorsu]?|" & _
"D[bsy]|" & _
"E[rsu]|" & _
"F[elmr]?|" & _
"G[ade]|" & _
"H[efgos]?|" & _
"I[nr]?|" & _
"K[r]?|" & _
"L[airuv]|" & _
"M[cdgnot]|" & _
"N[abdehiop]?|" & _
"O[gs]?|" & _
"P[abdmortu]?|" & _
"R[abefghnu]|" & _
"S[bcegimnr]?|" & _
"T[abcehilms]|" & _
"[UVW]|" & _
"X[e]|" & _
"Y[b]?|" & _
"Z[nr]"
End Function
CodePudding user response:
Looks like you adapted your code to reuse the RegExp object as I suggested last time, and indeed I'd expect that to improve performance considerably. However I should have explained better how to implement but see the example below.
In the example I also took your second RegExp pattern but reworked the rest. This example seems to work for me with your sample data but that's all I've tested!
Option Explicit
Private regEx As RegExp
Private regEx2 As RegExp
Sub Test()
' formulas in A2:A7 and elements in B1:H1 (see OP's screenshot), return results in B2:H7
Range("B2:H7").Value = CountElements(Range("A2:A7").Value, Range("B1:H1"))
End Sub
Function CountElements(ChemFormulaRange As Variant, ElementRange As Variant) As Variant
Dim RetValRange() As Long
Dim RetVal As Long
Dim ChemFormula As String
Dim i As Long, j As Long
Dim mpoints As Long, npoints As Long
Dim Element As String
If regEx Is Nothing Then
Set regEx = New RegExp
With regEx
.Global = True
'.MultiLine = True ' ? only if working with multilines
.IgnoreCase = False
'first pattern matches every element once
.Pattern = "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
End With
Set regEx2 = New RegExp
With regEx2
.Global = True
'.MultiLine = True ?
.IgnoreCase = False
'second patternd finds parenthesis and multiplies elements within
.Pattern = "(\((. ?)\)([0-9]) ) ?"
End With
End If
' Convert input ranges to variant arrays
If TypeName(ChemFormulaRange) = "Range" Then ChemFormulaRange = ChemFormulaRange.Value
If TypeName(ElementRange) = "Range" Then ElementRange = ElementRange.Value
'parameter
npoints = UBound(ChemFormulaRange, 1) - LBound(ChemFormulaRange, 1) 1
mpoints = UBound(ElementRange, 2) - LBound(ElementRange, 2) 1
'dimension arrays
ReDim RetValRange(1 To npoints, 1 To mpoints)
'calculate all values
For i = 1 To npoints
ChemFormula = ChemFormulaRange(i, 1)
For j = 1 To mpoints
RetVal = 0
Element = ElementRange(1, j)
Call ChemRegex(ChemFormula, Element, RetVal)
RetValRange(i, j) = RetVal
Next
Next
'output answer
CountElements = RetValRange
' Set regEx = Nothing: Set regEx2 = Nothing
End Function
Private Sub ChemRegex(ChemFormula, Element, RetVal)
Dim Matches As MatchCollection, Matches2 As MatchCollection
Dim m As Match, m2 As Match
Set Matches = regEx.Execute(ChemFormula)
For Each m In Matches
If m.SubMatches(0) = Element Then
RetVal = RetVal IIf(Not m.SubMatches(1) = vbNullString, m.SubMatches(1), 1)
End If
Next m
If InStr(1, ChemFormula, "(") Then ' if the formula includes elements within parentheses
Set Matches2 = regEx2.Execute(ChemFormula)
For Each m2 In Matches2
Set Matches = regEx.Execute(m2.Value)
For Each m In Matches
If m.SubMatches(0) = Element Then
If m.SubMatches(1) = vbNullString Then
RetVal = RetVal m2.SubMatches(2) - 1
Else
RetVal = RetVal m.SubMatches(1) * (m2.SubMatches(2) - 1)
End If
End If
Next
Next m2
End If
End Sub
This could certainly be improved a little more by testing all the elements in one RegExp exectution as suggested by CDP1802, but I'll leave that to you!