Home > Software design >  Extract numbers from chemical formula - including multiple elements within parentheses
Extract numbers from chemical formula - including multiple elements within parentheses

Time:12-14

I have a list of thousands of chemical formulae in a spreadsheet, and I want to count the number of times each chemical element appears in each chemical formula. Some examples are given here:

  1. CH3NO3
  2. CSe2
  3. C2Cl2
  4. C2Cl2O2
  5. C4H6COOH
  6. (C6H5)2P(CH2)6P(C6H5)2

enter image description here

I have found some code by @PEH (enter image description here

Private RegEx As RegExp

Function CountElements(ChemFormulaRange As Variant, ElementRange As Variant) As Variant

'define variables
Dim RetValRange() As Long
Dim RetVal As Long
Dim ChemFormula As String
Dim npoints As Long
Dim i As Long
Dim mpoints As Long
Dim j As Long

' Connvert input ranges to variant arrays
If TypeName(ChemFormulaRange) = "Range" Then ChemFormulaRange = ChemFormulaRange.Value
If TypeName(ElementRange) = "Range" Then ElementRange = ElementRange.Value

'parameter
npoints = UBound(ChemFormulaRange, 1) - LBound(ChemFormulaRange, 1)   1
mpoints = UBound(ElementRange, 2) - LBound(ElementRange, 2)   1

'dimension array
ReDim RetValRange(1 To npoints, 1 To mpoints)

If RegEx Is Nothing Then
    Set RegEx = New RegExp
    ' apply the properties
End If

'calculate all values
For j = 1 To mpoints
    Element = ElementRange(1, j)
        For i = 1 To npoints
        RetVal = 0
        ChemFormula = ChemFormulaRange(i, 1)
            Call ChemRegex(ChemFormula, Element, RetVal, RegEx)
        RetValRange(i, j) = RetVal
        Next i
Next j

'output answer
CountElements = RetValRange

End Function
Private Sub ChemRegex(ChemFormula, Element, RetVal, RegEx)
    
'ChemRegex created by PEH (CC BY-SA 4.0) https://stackoverflow.com/a/46091904/17194644
    
    With RegEx
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
    End With
    
    'first pattern matches every element once
    RegEx.Pattern = "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
    
    Dim Matches As MatchCollection
    Set Matches = RegEx.Execute(ChemFormula)
    
    Dim m As Match
    For Each m In Matches
        If m.SubMatches(0) = Element Then
            RetVal = RetVal   IIf(Not m.SubMatches(1) = vbNullString, m.SubMatches(1), 1)
        End If
    Next m
    
End Sub

CodePudding user response:

You might get a performance improvement from extracting all the elements in one regex execution rather that for only one element at a time.

Option Explicit

Sub Demo()

    Dim lastrow As Long, lastcol As Long
    Dim c As Long, r As Long, d As Object
    Dim f As String, el As String, ar
    Set d = CreateObject("Scripting.Dictionary")
    
    With Sheet1
        lastrow = .Cells(.Rows.Count, "A").End(xlUp).Row
        lastcol = .Cells(1, .Columns.Count).End(xlToLeft).Column
        ar = .Cells(1, 1).Resize(lastrow, lastcol)
        
        For r = 2 To lastrow
            f = ar(r, 1)
            Call parse(d, f)
            For c = 2 To lastcol
               el = ar(1, c)
               If d.exists(el) Then
                   ar(r, c) = d(el)
               End If
            Next
            d.RemoveAll
        Next
        .Cells(1, 1).Resize(lastrow, lastcol) = ar
    End With
    
    MsgBox "Done"
    
End Sub

Sub parse(ByRef dict, s As String)
  
    Dim regex As Object
    Set regex = CreateObject("VBScript.RegExp")
    With regex
        .Global = False
        .Pattern = "[(]([^)] )[)](\d )"
    End With
    
    ' expand bracket into multiple entries
    Dim m, matches, sm, n As Long, el As String
    Do While regex.test(s)
        Set m = regex.Execute(s)
        For n = 1 To m(0).submatches(1)
            s = s & " " & m(0).submatches(0)
        Next
        s = regex.Replace(s, "")
    Loop
    
    ' count elements
    regex.Pattern = "(" & Symbols & ")([0-9]*)"
    regex.Global = True
    If regex.test(s) Then
        Set matches = regex.Execute(s)
        For Each m In matches
            el = m.submatches(0)
            n = Val(m.submatches(1))
            If n = 0 Then n = 1
            dict(el) = dict(el)   n
        Next
    End If
    
End Sub

Function Symbols() As String

   Symbols = "A[cglmrstu]|" & _
        "B[aehikr]?|" & _
        "C[adeflmnorsu]?|" & _
        "D[bsy]|" & _
        "E[rsu]|" & _
        "F[elmr]?|" & _
        "G[ade]|" & _
        "H[efgos]?|" & _
        "I[nr]?|" & _
        "K[r]?|" & _
        "L[airuv]|" & _
        "M[cdgnot]|" & _
        "N[abdehiop]?|" & _
        "O[gs]?|" & _
        "P[abdmortu]?|" & _
        "R[abefghnu]|" & _
        "S[bcegimnr]?|" & _
        "T[abcehilms]|" & _
        "[UVW]|" & _
        "X[e]|" & _
        "Y[b]?|" & _
        "Z[nr]"

End Function

CodePudding user response:

Looks like you adapted your code to reuse the RegExp object as I suggested last time, and indeed I'd expect that to improve performance considerably. However I should have explained better how to implement but see the example below.

In the example I also took your second RegExp pattern but reworked the rest. This example seems to work for me with your sample data but that's all I've tested!

Option Explicit    
Private regEx As RegExp
Private regEx2 As RegExp

Sub Test()
' formulas in A2:A7 and elements in B1:H1 (see OP's screenshot), return results in B2:H7
    Range("B2:H7").Value = CountElements(Range("A2:A7").Value, Range("B1:H1"))
End Sub

Function CountElements(ChemFormulaRange As Variant, ElementRange As Variant) As Variant
Dim RetValRange() As Long
Dim RetVal As Long
Dim ChemFormula As String
Dim i As Long, j As Long
Dim mpoints As Long, npoints As Long
Dim Element As String

    If regEx Is Nothing Then
        Set regEx = New RegExp
        With regEx
            .Global = True
            '.MultiLine = True ' ? only if working with multilines
            .IgnoreCase = False
            
            'first pattern matches every element once
            .Pattern = "([A][cglmrstu]|[B][aehikr]?|[C][adeflmnorsu]?|[D][bsy]|[E][rsu]|[F][elmr]?|[G][ade]|[H][efgos]?|[I][nr]?|[K][r]?|[L][airuv]|[M][cdgnot]|[N][abdehiop]?|[O][gs]?|[P][abdmortu]?|[R][abefghnu]|[S][bcegimnr]?|[T][abcehilms]|[U]|[V]|[W]|[X][e]|[Y][b]?|[Z][nr])([0-9]*)"
        End With
        
        Set regEx2 = New RegExp
        With regEx2
            .Global = True
            '.MultiLine = True ?
            .IgnoreCase = False

            'second patternd finds parenthesis and multiplies elements within
            .Pattern = "(\((. ?)\)([0-9]) ) ?"
        End With
    End If


    ' Convert input ranges to variant arrays
    If TypeName(ChemFormulaRange) = "Range" Then ChemFormulaRange = ChemFormulaRange.Value
    If TypeName(ElementRange) = "Range" Then ElementRange = ElementRange.Value

    'parameter
    npoints = UBound(ChemFormulaRange, 1) - LBound(ChemFormulaRange, 1)   1
    mpoints = UBound(ElementRange, 2) - LBound(ElementRange, 2)   1

    'dimension arrays
    ReDim RetValRange(1 To npoints, 1 To mpoints)

    'calculate all values
    For i = 1 To npoints
        ChemFormula = ChemFormulaRange(i, 1)
        For j = 1 To mpoints
            RetVal = 0
            Element = ElementRange(1, j)
            Call ChemRegex(ChemFormula, Element, RetVal)
            RetValRange(i, j) = RetVal
        Next
    Next

    'output answer
    CountElements = RetValRange
    
   ' Set regEx = Nothing: Set regEx2 = Nothing

End Function

Private Sub ChemRegex(ChemFormula, Element, RetVal)
Dim Matches As MatchCollection, Matches2 As MatchCollection
Dim m As Match, m2 As Match
    
    Set Matches = regEx.Execute(ChemFormula)
    For Each m In Matches
        If m.SubMatches(0) = Element Then
            RetVal = RetVal   IIf(Not m.SubMatches(1) = vbNullString, m.SubMatches(1), 1)
        End If
    Next m

    If InStr(1, ChemFormula, "(") Then ' if the formula includes elements within parentheses
        Set Matches2 = regEx2.Execute(ChemFormula)
        For Each m2 In Matches2
            Set Matches = regEx.Execute(m2.Value)
            For Each m In Matches
                If m.SubMatches(0) = Element Then
                    If m.SubMatches(1) = vbNullString Then
                        RetVal = RetVal   m2.SubMatches(2) - 1
                    Else
                        RetVal = RetVal   m.SubMatches(1) * (m2.SubMatches(2) - 1)
                    End If
                End If
            Next
        Next m2
    End If

End Sub

This could certainly be improved a little more by testing all the elements in one RegExp exectution as suggested by CDP1802, but I'll leave that to you!

  • Related