Home > OS >  Delete multidimensional hash in a loop
Delete multidimensional hash in a loop

Time:08-29

I'm doing a hierarchical clustering and I need to remove the clustered elements before moving on to the next step. I did the code for a single-dimensional hash and it ran fine. Now I have a two-dimensional hash, I'm unable to delete the elements.

use strict;
use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my @array = ('A1', 'A2', 'A3');

my %distances;
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors  = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            $distances{$key_1}{$key_2} = $deviation_vectors ** 0.5 unless $distances{$key_2}{$key_1};
        };
    };
};
my @values;
while (my ($key, $element) = each %distances) {
    while (my ($element, $value) = each %{$element}) {
        push @values, $value;
    };
};
my $min = (sort {$a <=> $b} @values)[0];
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors  = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            if ($min == $deviation_vectors ** 0.5) {
                my $new_key = "$key_1,$key_2";
                foreach (@array) {
                    $hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_});
                };
# Problem here
# Delete doesn't completely remove the element, it returns a hash with an empty key element
                delete $hash{$key_1};
                delete $hash{$key_2};
            };
        };
    };
};
print Dumper \%hash;

sub mean {
    my @data = @_;
    my $sum;
    foreach (@data) {
        $sum  = $_;
    };
    return ($sum / @data)
};

This is the result I got...

$VAR1 = {
          'S4' => {},
          'S2' => {},
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S2,S4' => {
                       'A2' => 7,
                       'A1' => '6.5',
                       'A3' => '3.5'
                     },
          'S1' => {
                    'A3' => 5,
                    'A1' => 10,
                    'A2' => 11
                  },
          'S5' => {
                    'A3' => 10,
                    'A1' => 3,
                    'A2' => 2
                  }
        };

'S2' and 'S4' need to be completely removed from the hash.

CodePudding user response:

Please inspect following code which is based on provided code with some modification to remove excessive loop cycles with introduction of two indexes.

Perhaps hash %distances in this algorithm is excessive, it is kept for demonstration purpose only as it can be useful to OP.

NOTE: the code is provided for an demonstration purpose in an attempt to improve code readability

INFO: $distance ** 0.5 is better written as sqrt($distance), documentation sqrt

use strict;
use warnings;
use feature 'say';

use Data::Dumper;

my %hash = (
    'S1' => {
        'A1' => 10,
        'A2' => 11,
        'A3' => 5,
    },
    'S2' => {
        'A1' => 6,
        'A2' => 8,
        'A3' => 3,
    },
    'S3' => {
        'A1' => 20,
        'A2' => 21,
        'A3' => 15,
    },
    'S4' => {
        'A1' => 7,
        'A2' => 6,
        'A3' => 4,
    },
    'S5' => {
        'A1' => 3,
        'A2' => 2,
        'A3' => 10,
    },
);

my(%distances, $deviation, @array, @keys);

@array = qw(A1 A2 A3);
@keys  = sort keys %hash;

for my $index_1 (0..$#keys) {
    for my $index_2 (1 $index_1..$#keys) {
        my($distance, $key_1, $key_2) = (0, $keys[$index_1], $keys[$index_2]);
        
        $distance  = ( $hash{$key_1}{$_} - $hash{$key_2}{$_} ) ** 2 for @array;
        $distance = $distance ** 0.5;

        $distances{$key_1}{$key_2} = $distance;

        $deviation->{min} = $distance unless $deviation->{min};
         
        if( $deviation->{min} > $distance ) {
            $deviation->{min} = $distance;
            $deviation->{keys} = [$key_1, $key_2];
        }
    }
}

my($key_1, $key_2) = $deviation->{keys}->@*;

$hash{"$key_1,$key_2"}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) for @array;

delete @hash{($key_1, $key_2)};

say Dumper(\%hash);

exit 0;

sub mean {
    my @data = @_;
    my $sum;
    
    $sum  = $_ for @data;
    
    return $sum / @data;
}

Output sample

$VAR1 = {
          'S2,S4' => {
                       'A1' => '6.5',
                       'A2' => '7',
                       'A3' => '3.5'
                     },
          'S5' => {
                    'A1' => 3,
                    'A3' => 10,
                    'A2' => 2
                  },
          'S3' => {
                    'A1' => 20,
                    'A3' => 15,
                    'A2' => 21
                  },
          'S1' => {
                    'A2' => 11,
                    'A3' => 5,
                    'A1' => 10
                  }
        };

CodePudding user response:

I found a solution but it doesn't look really good tho.

...
my @deleted;
for my $key_1 (sort keys %hash) {
    for my $key_2 (sort keys %hash) {
        if ($key_1 ne $key_2) {
            my $deviation_vectors;
            foreach (@array) {
                $deviation_vectors  = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
            };
            if ($min == $deviation_vectors ** 0.5) {
                my $new_key = "$key_1,$key_2";
                my $duplicated_key = "$key_2,$key_1";
                foreach (@array) {
                    $hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) unless $hash{$duplicated_key};
                };
                @deleted = ($key_1, $key_2);
            };
        };
    };
};
foreach (@deleted) {
    delete $hash{$_};
};
...
  • Related