Delete multidimensional hash in a loop-CodePudding

I'm doing a hierarchical clustering and I need to remove the clustered elements before moving on to the next step. I did the code for a single-dimensional hash and it ran fine. Now I have a two-dimensional hash, I'm unable to delete the elements.

use strict;
use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my @array = ('A1', 'A2', 'A3');

my %distances;
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors  = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            $distances{$key_1}{$key_2} = $deviation_vectors ** 0.5 unless $distances{$key_2}{$key_1};
        };
    };
};
my @values;
while (my ($key, $element) = each %distances) {
    while (my ($element, $value) = each %{$element}) {
        push @values, $value;
    };
};
my $min = (sort {$a <=> $b} @values)[0];
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors  = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            if ($min == $deviation_vectors ** 0.5) {
                my $new_key = "$key_1,$key_2";
                foreach (@array) {
                    $hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_});
                };
# Problem here
# Delete doesn't completely remove the element, it returns a hash with an empty key element
                delete $hash{$key_1};
                delete $hash{$key_2};
            };
        };
    };
};
print Dumper \%hash;

sub mean {
    my @data = @_;
    my $sum;
    foreach (@data) {
        $sum  = $_;
    };
    return ($sum / @data)
};

This is the result I got...

$VAR1 = {
          'S4' => {},
          'S2' => {},
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S2,S4' => {
                       'A2' => 7,
                       'A1' => '6.5',
                       'A3' => '3.5'
                     },
          'S1' => {
                    'A3' => 5,
                    'A1' => 10,
                    'A2' => 11
                  },
          'S5' => {
                    'A3' => 10,
                    'A1' => 3,
                    'A2' => 2
                  }
        };

'S2' and 'S4' need to be completely removed from the hash.

CodePudding user response：

Please inspect following code which is based on provided code with some modification to remove excessive loop cycles with introduction of two indexes.

Perhaps hash %distances in this algorithm is excessive, it is kept for demonstration purpose only as it can be useful to OP.

NOTE: the code is provided for an demonstration purpose in an attempt to improve code readability

INFO: $distance ** 0.5 is better written as sqrt($distance), documentation sqrt

use strict;
use warnings;
use feature 'say';

use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my(%distances, $deviation, @array, @keys);

@array = qw(A1 A2 A3);
@keys  = sort keys %hash;

for my $index_1 (0..$#keys) {
    for my $index_2 (1 $index_1..$#keys) {
        my($distance, $key_1, $key_2) = (0, $keys[$index_1], $keys[$index_2]);
        
        $distance  = ( $hash{$key_1}{$_} - $hash{$key_2}{$_} ) ** 2 for @array;
        $distance = $distance ** 0.5;

        $distances{$key_1}{$key_2} = $distance;

        $deviation->{min} = $distance unless $deviation->{min};
         
        if( $deviation->{min} > $distance ) {
            $deviation->{min} = $distance;
            $deviation->{keys} = [$key_1, $key_2];
        }
    }
}

my($key_1, $key_2) = $deviation->{keys}->@*;

$hash{"$key_1,$key_2"}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) for @array;

delete @hash{($key_1, $key_2)};

say Dumper(\%hash);

exit 0;

sub mean {
    my @data = @_;
    my $sum;
    
    $sum  = $_ for @data;
    
    return $sum / @data;
}

Output sample

$VAR1 = {
          'S2,S4' => {
                       'A1' => '6.5',
                       'A2' => '7',
                       'A3' => '3.5'
                     },
          'S5' => {
                    'A1' => 3,
                    'A3' => 10,
                    'A2' => 2
                  },
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S1' => {
                    'A2' => 11,
                    'A3' => 5,
                    'A1' => 10
                  }
        };

CodePudding user response：

I found a solution but it doesn't look really good tho.

...
my @deleted;
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors  = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            if ($min == $deviation_vectors ** 0.5) {
                my $new_key = "$key_1,$key_2";
                my $duplicated_key = "$key_2,$key_1";
                foreach (@array) {
                    $hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) unless $hash{$duplicated_key};
                };
                @deleted = ($key_1, $key_2);
            };
        };
    };
};
foreach (@deleted) {
    delete $hash{$_};
};
...