I'm doing a hierarchical clustering and I need to remove the clustered elements before moving on to the next step. I did the code for a single-dimensional hash and it ran fine. Now I have a two-dimensional hash, I'm unable to delete the elements.
use strict;
use Data::Dumper;
my %hash = (
'S1' => {
'A1' => 10,
'A2' => 11,
'A3' => 5,
},
'S2' => {
'A1' => 6,
'A2' => 8,
'A3' => 3,
},
'S3' => {
'A1' => 20,
'A2' => 21,
'A3' => 15,
},
'S4' => {
'A1' => 7,
'A2' => 6,
'A3' => 4,
},
'S5' => {
'A1' => 3,
'A2' => 2,
'A3' => 10,
},
);
my @array = ('A1', 'A2', 'A3');
my %distances;
for my $key_1 (sort keys %hash) {
for my $key_2 (sort keys %hash) {
if ($key_1 ne $key_2) {
my $deviation_vectors;
foreach (@array) {
$deviation_vectors = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
};
$distances{$key_1}{$key_2} = $deviation_vectors ** 0.5 unless $distances{$key_2}{$key_1};
};
};
};
my @values;
while (my ($key, $element) = each %distances) {
while (my ($element, $value) = each %{$element}) {
push @values, $value;
};
};
my $min = (sort {$a <=> $b} @values)[0];
for my $key_1 (sort keys %hash) {
for my $key_2 (sort keys %hash) {
if ($key_1 ne $key_2) {
my $deviation_vectors;
foreach (@array) {
$deviation_vectors = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
};
if ($min == $deviation_vectors ** 0.5) {
my $new_key = "$key_1,$key_2";
foreach (@array) {
$hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_});
};
# Problem here
# Delete doesn't completely remove the element, it returns a hash with an empty key element
delete $hash{$key_1};
delete $hash{$key_2};
};
};
};
};
print Dumper \%hash;
sub mean {
my @data = @_;
my $sum;
foreach (@data) {
$sum = $_;
};
return ($sum / @data)
};
This is the result I got...
$VAR1 = {
'S4' => {},
'S2' => {},
'S3' => {
'A1' => 20,
'A3' => 15,
'A2' => 21
},
'S2,S4' => {
'A2' => 7,
'A1' => '6.5',
'A3' => '3.5'
},
'S1' => {
'A3' => 5,
'A1' => 10,
'A2' => 11
},
'S5' => {
'A3' => 10,
'A1' => 3,
'A2' => 2
}
};
'S2' and 'S4' need to be completely removed from the hash.
CodePudding user response:
Please inspect following code which is based on provided code with some modification to remove excessive loop cycles with introduction of two indexes.
Perhaps hash %distances
in this algorithm is excessive, it is kept for demonstration purpose only as it can be useful to OP.
NOTE: the code is provided for an demonstration purpose in an attempt to improve code readability
INFO: $distance ** 0.5
is better written as sqrt($distance)
, documentation sqrt
use strict;
use warnings;
use feature 'say';
use Data::Dumper;
my %hash = (
'S1' => {
'A1' => 10,
'A2' => 11,
'A3' => 5,
},
'S2' => {
'A1' => 6,
'A2' => 8,
'A3' => 3,
},
'S3' => {
'A1' => 20,
'A2' => 21,
'A3' => 15,
},
'S4' => {
'A1' => 7,
'A2' => 6,
'A3' => 4,
},
'S5' => {
'A1' => 3,
'A2' => 2,
'A3' => 10,
},
);
my(%distances, $deviation, @array, @keys);
@array = qw(A1 A2 A3);
@keys = sort keys %hash;
for my $index_1 (0..$#keys) {
for my $index_2 (1 $index_1..$#keys) {
my($distance, $key_1, $key_2) = (0, $keys[$index_1], $keys[$index_2]);
$distance = ( $hash{$key_1}{$_} - $hash{$key_2}{$_} ) ** 2 for @array;
$distance = $distance ** 0.5;
$distances{$key_1}{$key_2} = $distance;
$deviation->{min} = $distance unless $deviation->{min};
if( $deviation->{min} > $distance ) {
$deviation->{min} = $distance;
$deviation->{keys} = [$key_1, $key_2];
}
}
}
my($key_1, $key_2) = $deviation->{keys}->@*;
$hash{"$key_1,$key_2"}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) for @array;
delete @hash{($key_1, $key_2)};
say Dumper(\%hash);
exit 0;
sub mean {
my @data = @_;
my $sum;
$sum = $_ for @data;
return $sum / @data;
}
Output sample
$VAR1 = {
'S2,S4' => {
'A1' => '6.5',
'A2' => '7',
'A3' => '3.5'
},
'S5' => {
'A1' => 3,
'A3' => 10,
'A2' => 2
},
'S3' => {
'A1' => 20,
'A3' => 15,
'A2' => 21
},
'S1' => {
'A2' => 11,
'A3' => 5,
'A1' => 10
}
};
CodePudding user response:
I found a solution but it doesn't look really good tho.
...
my @deleted;
for my $key_1 (sort keys %hash) {
for my $key_2 (sort keys %hash) {
if ($key_1 ne $key_2) {
my $deviation_vectors;
foreach (@array) {
$deviation_vectors = ($hash{$key_1}{$_} - $hash{$key_2}{$_}) ** 2;
};
if ($min == $deviation_vectors ** 0.5) {
my $new_key = "$key_1,$key_2";
my $duplicated_key = "$key_2,$key_1";
foreach (@array) {
$hash{$new_key}{$_} = mean($hash{$key_1}{$_}, $hash{$key_2}{$_}) unless $hash{$duplicated_key};
};
@deleted = ($key_1, $key_2);
};
};
};
};
foreach (@deleted) {
delete $hash{$_};
};
...