Home > OS >  PowerQuery: Function to get Duplicates info for given Columnnames
PowerQuery: Function to get Duplicates info for given Columnnames

Time:12-08

I need Some function in PowerQuery to get Additional Columns for duplicated data (not just keep/remove duplicates)

Example: enter image description here

For the given table I want to get following info for duplicated columns set {"Product", "Color"}:

  1. Minimal RowId - basicaly, Id of the 1st occurence of data
  2. Nr. of Duplicate - duplicates counter

CodePudding user response:

try grouping then expanding in powerquery

let Source = Excel.CurrentWorkbook(){[Name="Table1"]}[Content],
#"Grouped Rows" = Table.Group(Source, {"Product", "Color"}, {
    {"data", each Table.AddIndexColumn(_, "nDupl", 0, 1, Int64.Type), type table},
    {"MinRowID", each List.Min(_[RowId]), type number}
    }),
#"Expanded data" = Table.ExpandTableColumn(#"Grouped Rows", "data", {"RowId", "Date", "amount", "nDupl"}, {"RowId", "Date", "amount", "nDupl"})
in  #"Expanded data"

enter image description here

CodePudding user response:

Please try following function (download):

Function call Example:

tfnAddDuplicatesInfo(Source,{"Product","Color"},"nDupl" ,"MinRowId","RowId")

let 
    func = (
                 srcTable            as          table,      // input Table
                 inGroupBy           as           list,      // List of ColumnNames to search duplicates 
        optional outDuplCounter      as nullable  text,      // ColumnName for added column with Group Occurance counter (0 based)
        optional outMinGroupRowId    as nullable  text,      // ColumnName for added column with Minimal RowId within group
        optional inRowId             as nullable  text       // RowId ColumnName - required for outMinGroupRowId calculation
    ) =>
    let 
        AddMinGroupRowId = if outMinGroupRowId <> null then tfnAddDuplMinRowId(srcTable        , inGroupBy, inRowId, outMinGroupRowId) else srcTable,
        AddDuplCounter   = if outDuplCounter   <> null then tfnAddDuplCounter (AddMinGroupRowId, inGroupBy,          outDuplCounter  ) else AddMinGroupRowId,
        Result = AddDuplCounter
    in 
        Result,

    tfnAddDuplMinRowId = (srcTable as table, inGroupBy as list, inRowId as text, outMinGroupRowId as text)=>
    let
        Source = srcTable,
        srcColumns = Table.ColumnNames(Source),        
        Columns2Expand = List.Difference(srcColumns,inGroupBy),
        srcTableType = Type.TableRow(
            Value.Type(
                Table.SelectColumns(Source,Columns2Expand)
            )
        ),
        RowIdType =Type.TableColumn(Value.Type(Source),inRowId),
        #"Grouped Rows" = Table.Group(
            Source, 
            inGroupBy, 
            {
            //  {outMinGroupRowId & "0", each List.Min(Table.Column(_,inRowId)), type number},  //probably, Table.Column works slower than Record.Field            
                {outMinGroupRowId      , each List.Min(Record.Field(_,inRowId)),  RowIdType },
                {"__AllRows__"   , each _                                , type table srcTableType}
            }
        ),
        #"Expanded __AllRows__" = Table.ExpandTableColumn(#"Grouped Rows", "__AllRows__", Columns2Expand),
        Result_tfnAddDuplMinRowId = Table.ReorderColumns(#"Expanded __AllRows__", List.Combine({srcColumns,{outMinGroupRowId}}))
    in
        Result_tfnAddDuplMinRowId,
        
    tfnAddDuplCounter = (MyTable as table, GroupByColumns as list, outDuplCounter as text) =>
    let
        Source = MyTable,

        //---------- RunningTotalFunction ----------
        fxRunningTotal = ( RT_ColumnName as text, RT_Table as table) =>
        let
          Source = RT_Table,      
          RowsCount = Table.RowCount(RT_Table),
          RunningTotal = 
            List.Generate ( () => 0,
                each _ < RowsCount,
                each _  1
            ),
          #"Combined Table   RT" = 
            Table.FromColumns( 
                Table.ToColumns( RT_Table )   
                    & { Value.ReplaceType( RunningTotal, type {Int64.Type} ) }, 
                Table.ColumnNames( RT_Table ) & { RT_ColumnName } )
        in
          #"Combined Table   RT",
        //---------- END RunningTotalFunction ----------


        // Applying function to GroupBy Operation      
        #"Grouped Rows" = 
          Table.Group(
              Source, 
              GroupByColumns, 
              { {"Details", each fxRunningTotal ( outDuplCounter, _), type table } }
          ),
        #"Removed Other Columns" = 
          Table.SelectColumns(#"Grouped Rows",{"Details"}),
        #"Expanded Details" = 
          Table.ExpandTableColumn(#"Removed Other Columns", "Details", 
              Table.ColumnNames( Source ) & { outDuplCounter }, 
              Table.ColumnNames( Source ) & { outDuplCounter } ),
        RestoreDatatypes = Value.ReplaceType( 
            #"Expanded Details", 
            Value.Type(  // Creates dummy column to retrieve data type 
                Table.AddColumn( 
                    Source, 
                    outDuplCounter, 
                    each null, 
                    Int64.Type
                ) 
            )
        ) 
    in
        RestoreDatatypes,
        
    documentation = [
        Documentation.Name =  " tfnAddDuplicatesInfo ",
        Documentation.Description     = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
        Documentation.LongDescription = " Adds two info columns for Duplicates - 1st occurence RowId and given group Occurence Number",
        Documentation.Category = " Running Total ",
        Documentation.Source = "  ",
        Documentation.Version = " 1.0 ",
        Documentation.Author = " Denis Sipchenko ",
        Documentation.Examples = {
            [
                Description =  "tfnAddDuplicatesInfo arguments: ",
                Code = "
         srcTable            as          table,      // input Table
         inGroupBy           as           list,      // List of ColumnNames to search duplicates 
optional outDuplCounter      as nullable  text,      // ColumnName for added column with Group Occurance counter (0 based)
optional outMinGroupRowId    as nullable  text,      // ColumnName for added column with Minimal RowId within group
optional inRowId             as nullable  text       // RowId ColumnName - required for outMinGroupRowId calculation            ",
           Result = "
Adds two info columns for Duplicates: 
    outMinGroupRowId    - Minimal RowId within within given group
    outDuplCounter      - given group Occurence Number
"
            ],
            [
                Description =  "tfnAddDuplicatesInfo function call example  ",
                Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93 w8n2/uZWwBml07NfwVTIS nN PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
    tfnAddDuplicatesInfo(Source,{""Product"",""Color""},""nDupl"" ,""MinRowId"",""RowId"")
",
                Result = "Adds to Source table two columns:
  ""MinRowId"" - Minimal RowId within within given group,
  ""nDupl""    - given group Occurence Number
"
            ],
            [
                Description =  "tfnAddDuplicatesInfo function short call example  ",
                Code = "
let Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(""hZTBasMwEET/xWdDdteSbP9CT4U2h2JyCK1oQ0xS3IT8frUpWsmSqpxs4ccw2pn1NDXYtA3CBsYNAZE7PNn96cc93 w8n2/uZWwBml07NfwVTIS nN PK1SDZzuW1RG7PX3Y5Wb3y4r3uHKHDgrSz9fle7buRQ2e1e5EpuA4sORZw x/NgIvtnu2jbGP42G5rMS73sMDw0MdlhuODKua68Ai8KT7CH49fH5dVqOOaI6QoO5DCX1PkeraKDTnSKquLdNDjhGLvgMtsE6NZHUKrEnrVBPuU8/F0El6jRykox UlSR45DCJamEGmODhhpERGNOa5BeNaErrna0NSU3ovpJjXVpqQip1LcGLbZSVJJ1OMLsjBtcm/Y8Ux43BCwcKxa0s0UPqPC84/hV89ws="", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [RowId = Int64.Type, Date = date, Product = _t, Color = _t, Amount = Currency.Type])
in
    tfnAddDuplicatesInfo(Source,{""Product"",""Color""},""nDupl"")
",
                Result = "Adds to Source table one column:
  ""nDupl""    - given group Occurence Number
"
            ]
        }
    ]
in  
    Value.ReplaceType(func, Value.ReplaceMetadata(Value.Type(func), documentation))

P.S. It's a rework from Running Total by Category by Rick de Groot

  • Related