I need Some function in PowerQuery
to get Additional Columns for duplicated
data (not just keep/remove duplicates)
For the given table I want to get following info for duplicated columns set {"Product", "Color"}
:
- Minimal RowId - basicaly, Id of the 1st occurence of data
- Nr. of Duplicate - duplicates counter
CodePudding user response:
try grouping then expanding in powerquery
let Source = Excel.CurrentWorkbook(){[Name="Table1"]}[Content],
#"Grouped Rows" = Table.Group(Source, {"Product", "Color"}, {
{"data", each Table.AddIndexColumn(_, "nDupl", 0, 1, Int64.Type), type table},
{"MinRowID", each List.Min(_[RowId]), type number}
}),
#"Expanded data" = Table.ExpandTableColumn(#"Grouped Rows", "data", {"RowId", "Date", "amount", "nDupl"}, {"RowId", "Date", "amount", "nDupl"})
in #"Expanded data"
CodePudding user response:
Please try following function (download):
Function call Example:
tfnAddDuplicatesInfo(Source,{"Product","Color"},"nDupl" ,"MinRowId","RowId")
let
func = (
srcTable as table, // input Table
inGroupBy as list, // List of ColumnNames to search duplicates
optional outDuplCounter as nullable text, // ColumnName for added column with Group Occurance counter (0 based)
optional outMinGroupRowId as nullable text, // ColumnName for added column with Minimal RowId within group
optional inRowId as nullable text // RowId ColumnName - required for outMinGroupRowId calculation
) =>
let
AddMinGroupRowId = if outMinGroupRowId <> null then tfnAddDuplMinRowId(srcTable , inGroupBy, inRowId, outMinGroupRowId) else srcTable,
AddDuplCounter = if outDuplCounter <> null then tfnAddDuplCounter (AddMinGroupRowId, inGroupBy, outDuplCounter ) else AddMinGroupRowId,
Result = AddDuplCounter
in
Result,
tfnAddDuplMinRowId = (srcTable as table, inGroupBy as list, inRowId as text, outMinGroupRowId as text)=>
let
Source = srcTable,
srcColumns = Table.ColumnNames(Source),
Columns2Expand = List.Difference(srcColumns,inGroupBy),
srcTableType = Type.TableRow(
Value.Type(
Table.SelectColumns(Source,Columns2Expand)
)
),
RowIdType =Type.TableColumn(Value.Type(Source),inRowId),
#"Grouped Rows" = Table.Group(
Source,
inGroupBy,
{
// {outMinGroupRowId & "0", each List.Min(Table.Column(_,inRowId)), type number}, //probably, Table.Column works slower than Record.Field
{outMinGroupRowId , each List.Min(Record.Field(_,inRowId)), RowIdType },
{"__AllRows__" , each _ , type table srcTableType}
}
),
#"Expanded __AllRows__" = Table.ExpandTableColumn(#"Grouped Rows", "__AllRows__", Columns2Expand),
Result_tfnAddDuplMinRowId = Table.ReorderColumns(#"Expanded __AllRows__", List.Combine({srcColumns,{outMinGroupRowId}}))
in
Result_tfnAddDuplMinRowId,
tfnAddDuplCounter = (MyTable as table, GroupByColumns as list, outDuplCounter as text) =>
let
Source = MyTable,
//---------- RunningTotalFunction ----------
fxRunningTotal = ( RT_ColumnName as text, RT_Table as table) =>
let
Source = RT_Table,
RowsCount = Table.RowCount(RT_Table),
RunningTotal =
List.Generate ( () => 0,
each _ < RowsCount,
each _ 1
),
#"Combined Table RT" =
Table.FromColumns(
Table.ToColumns( RT_Table )
& { Value.ReplaceType( RunningTotal, type {Int64.Type} ) },
Table.ColumnNames( RT_Table ) & { RT_ColumnName } )
in
#"Combined Table RT",
//---------- END RunningTotalFunction ----------
// Applying function to GroupBy Operation
#"Grouped Rows" =
Table.Group(
Source,
GroupByColumns,
{ {"Details", each fxRunningTotal ( outDuplCounter, _), type table } }
),
#"Removed Other Columns" =
Table.SelectColumns(#"Grouped Rows",{"Details"}),
#"Expanded Details" =
Table.ExpandTableColumn(#"Removed Other Columns", "Details",
Table.ColumnNames( Source ) & { outDuplCounter },
Table.ColumnNames( Source ) & { outDuplCounter } ),
RestoreDatatypes = Value.ReplaceType(
#"Expanded Details",
Value.Type( // Creates dummy column to retrieve data type
Table.AddColumn(
Source,
outDuplCounter,
each null,
Int64.Type
)
)
)
in
RestoreDatatypes,
documentation = [
Documentation.Name = " tfnAddDuplicatesInfo ",
Documentation.Description = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
Documentation.LongDescription = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
Documentation.Category = " Running Total ",
Documentation.Source = " ",
Documentation.Version = " 1.0 ",
Documentation.Author = " Denis Sipchenko ",
Documentation.Examples = {
[
Description = "tfnAddDuplicatesInfo arguments: ",
Code = "
srcTable as table, // input Table
inGroupBy as list, // List of ColumnNames to search duplicates
optional outDuplCounter as nullable text, // ColumnName for added column with Group Occurance counter (0 based)
optional outMinGroupRowId as nullable text, // ColumnName for added column with Minimal RowId within group
optional inRowId as nullable text // RowId ColumnName - required for outMinGroupRowId calculation ",
Result = "
Adds two info columns for Duplicates:
outMinGroupRowId - Minimal RowId within within given group
outDuplCounter - given group Occurence Number
"
],
[
Description = "tfnAddDuplicatesInfo function call example ",
Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93 w8n2/uZWwBml07NfwVTIS nN PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
tfnAddDuplicatesInfo(Source,{""Product"",""Color""},""nDupl"" ,""MinRowId"",""RowId"")
",
Result = "Adds to Source table two columns:
""MinRowId"" - Minimal RowId within within given group,
""nDupl"" - given group Occurence Number
"
],
[
Description = "tfnAddDuplicatesInfo function short call example ",
Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93 w8n2/uZWwBml07NfwVTIS nN PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
tfnAddDuplicatesInfo(Source,{""Product"",""Color""},""nDupl"")
",
Result = "Adds to Source table one column:
""nDupl"" - given group Occurence Number
"
]
}
]
in
Value.ReplaceType(func, Value.ReplaceMetadata(Value.Type(func), documentation))
P.S. It's a rework from Running Total by Category by Rick de Groot