I have an Excel macro code to extract unique mutations from GISAID metadata that involves:
- Trimming the "(" in the very beginning and the ")" in the very end of each value and auto-filling the trim formula down until the last row.
- Pasting (values only the trimmed data into a new sheet) and splitting the comma-delimited values.
- Stacking all the multi-columned rows into one column.
- Deleting all blank cells and shifting the subsequent cells up (if any blank cells are present).
- Removing duplicates.
This is the code that I've managed to build (I'm really really new in VBA, I've only started automating Excel processes because I'm working with GISAID data almost every day.) Users can paste the data from GISAID's .tsv metadata to A1 and just run the macro.
Sub MUTATIONS_MACRO()
'
Application.ScreenUpdating = False
Application.Calculation = xlCalculationManual
' MUTATIONS_MACRO_EXCEL_1 Macro
'
'
Range("B1").Select
Dim Lr As Long
Lr = Cells(Rows.Count, "A").End(xlUp).Row
Range("B1:B" & Lr).Formula = "=RIGHT((LEFT(RC[-1], LEN(RC[-1])-1)), LEN(LEFT(RC[-1], LEN(RC[-1])-1))-1)"
Range("B1").Select
Range(Selection, Selection.End(xlDown)).Select
Selection.Copy
Sheets.Add After:=ActiveSheet
Range("A1").PasteSpecial Paste:=xlPasteValues
Range("A1").Select
Range(Selection, Selection.End(xlDown)).Select
Selection.TextToColumns _
Destination:=Range("A1"), DataType:=xlDelimited, _
ConsecutiveDelimiter:=False, Comma:=True
ActiveCell.Rows("1:1").EntireRow.Select
Range(Selection, Selection.End(xlDown)).Select
Selection.SpecialCells(xlCellTypeBlanks).Select
Selection.Delete Shift:=xlUp
Range("A1").Select
Range(Selection, Selection.End(xlToRight)).Select
Range(Selection, Selection.End(xlDown)).Select
Dim vaCells As Variant
Dim vOutput() As Variant
Dim i As Long, j As Long
Dim lRow As Long
If TypeName(Selection) = "Range" Then
If Selection.Count > 1 Then
If Selection.Count <= Selection.Parent.Rows.Count Then
vaCells = Selection.Value
ReDim vOutput(1 To UBound(vaCells, 1) * UBound(vaCells, 2), 1 To 1)
For j = LBound(vaCells, 2) To UBound(vaCells, 2)
For i = LBound(vaCells, 1) To UBound(vaCells, 1)
If Len(vaCells(i, j)) > 0 Then
lRow = lRow 1
vOutput(lRow, 1) = vaCells(i, j)
End If
Next i
Next j
Selection.ClearContents
Selection.Cells(1).Resize(lRow).Value = vOutput
End If
End If
End If
Range("A1").Select
Range(Selection, Selection.End(xlDown)).Select
ActiveSheet.Range("A:A").RemoveDuplicates Columns:=1, Header:=xlNo
Application.ScreenUpdating = True
Application.Calculation = xlCalculationAutomatic
End Sub
This works perfectly for up to 1000 rows of data but if I start pasting more than 1100-ish rows onto column A, it starts to run weird and gives me results that are not in a single column. I'm not sure why it's running differently if the processes are exactly the same. Can anyone help? Thank you so much!
CodePudding user response:
Split Comma-Delimited Data to Column
Option Explicit
Sub ExtractMutations()
' Source
Const sName As String = "PASTE"
Const sFirstCellAddress As String = "A1"
Const sDelimiter As String = ","
' Destination
Const dName As String = "Mutations"
Const dFirstCellAddress As String = "A1"
Const dNameIncrementDelimiter As String = ""
' Workbook
Dim wb As Workbook: Set wb = ThisWorkbook
' Reference the source one-column range and write its values
' to a 2D one-based one-column array.
Dim sws As Worksheet: Set sws = wb.Worksheets(sName)
Dim srg As Range
Dim rCount As Long
With sws.Range(sFirstCellAddress)
Dim slCell As Range: Set slCell = .Resize(sws.Rows.Count - .Row 1) _
.Find("*", , xlFormulas, , , xlPrevious)
If slCell Is Nothing Then Exit Sub
rCount = slCell.Row - .Row 1
Set srg = .Resize(rCount)
End With
Dim sAddress As String: sAddress = srg.Address
Dim Data As Variant
' Get rid of the parentheses.
Data = sws.Evaluate("MID(" & sAddress & ",2,LEN(" & sAddress & ")-2)")
' Split the array data into a dictionary removing duplicates.
Dim dict As Object: Set dict = CreateObject("Scripting.Dictionary")
dict.CompareMode = vbTextCompare
Dim Key As Variant
Dim r As Long
For r = 1 To rCount
For Each Key In Split(Data(r, 1), sDelimiter)
If Not IsError(Key) Then
If Len(Key) > 0 Then
dict(Key) = Empty
End If
End If
Next Key
Next r
' Write the values from the dictionary to a 2D one-based one-column array.
rCount = dict.Count
ReDim Data(1 To rCount, 1 To 1)
r = 0
For Each Key In dict.Keys
r = r 1
Data(r, 1) = Key
Next Key
' Write the values from the array to the destination range.
Dim DN As String: DN = dName
r = 0
With wb.Worksheets.Add(After:=sws)
' If the destination worksheet name is taken, add an increment
Dim ErrNum As Long
Do
On Error Resume Next
.Name = DN
r = r 1: DN = dName & dNameIncrementDelimiter & r
ErrNum = Err.Number
On Error GoTo 0
Loop Until ErrNum = 0
' Write result.
With .Range(dFirstCellAddress)
.Resize(rCount).Value = Data
.EntireColumn.AutoFit
End With
End With
' Save the workbook.
'wb.Save
' Inform.
MsgBox "Mutations extracted.", vbInformation
End Sub
CodePudding user response:
@VBasic2008 beat me to it, but posting this anyway:
Sub MUTATIONS_MACRO()
Dim dict As Object, c As Range, arr, v, data, ws As Worksheet, r As Long, e
Set dict = CreateObject("scripting.dictionary")
Set ws = ActiveSheet
'get all data as an array
data = ActiveSheet.Range("A1:A" & ws.Cells(Rows.Count, 1).End(xlUp).Row).Value
For r = 1 To UBound(data, 1) 'loop over the array and process each value
v = Trim(data(r, 1)) 'read the value
If Len(v) > 2 Then 'not blank/too short?
v = Mid(v, 2, Len(v) - 2) 'remove ()
arr = Split(v, ",") 'split on comma
For Each e In arr 'loop values
dict(CStr(Trim(e))) = 1 'put in dictionary (unique only)
Next e
End If
Next r
DictKeysToSheet dict, ws.Parent.Worksheets.Add.Range("A1")
End Sub
'add a dictionary's keys to a sheet as a column starting at range `c`
Sub DictKeysToSheet(dict As Object, c As Range)
Dim arr(), keys, i As Long, r As Long
keys = dict.keys
ReDim arr(1 To dict.Count, 1 To 1)
r = 1
For i = LBound(keys) To UBound(keys)
arr(r, 1) = keys(i)
r = r 1
Next i
c.Resize(dict.Count).Value = arr
End Sub