Home > Software design >  Perl 5.34.0 regular expression for word matching language Hebrew
Perl 5.34.0 regular expression for word matching language Hebrew

Time:03-25

I am using Perl 5.34.0 and I want to find out whether an input consists only of Hebrew letters and some signs like the question mark, etc. Even though I found a solution with a lot of overhead, I would like to learn how to do it more simply with a regular expression.

This is my solution without regular expression.

First I defined the Hebrew characters in a constant hash in a Perl module.

#!perl
package  Enums::Nikudletters;

use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();

our $VERSION = 1.0;

# Types
# Classification tags stored in the TYPE field of each NIKUDLETTERS entry.
use constant {
    LETTER            => "LETTER",
    LOWER_PUNKTATION  => "LOWER_PUNKTATION",
    UPPER_PUNKTATION  => "UPPER_PUNKTATION",
    MIDDLE_PUNKTATION => "MIDDLE_PUNKTATION",
};

# NIKUDLETTERS: table of the accepted Hebrew letters, vowel points and related
# marks, keyed by an upper-case identifier. Each entry is a hash with:
#   UTF8        - the character itself, resolved at load time from its code
#                 point. charnames::string_vianame only understands the
#                 "U+XXXX" spelling; with a blank instead of the plus sign it
#                 returns undef.
#   CODE        - four hexadecimal digits of the code point behind a
#                 one-character prefix; joined by get_regular_expression_string.
#   NAME        - human-readable name of the character.
#   TYPE        - one of LETTER, LOWER_PUNKTATION, UPPER_PUNKTATION,
#                 MIDDLE_PUNKTATION.
#   WIDTH       - numeric width; 0 for every *_PUNKTATION entry
#                 (unit is not visible here - TODO confirm).
#   HANDWRITING - 0/1 flag (exact meaning is not visible here - TODO confirm).
use constant {
NIKUDLETTERS => {
AIN => {
    UTF8 => charnames::string_vianame("U+05E2"),
    CODE => " 05E2",
    NAME => "ain",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
ALEF => {
    UTF8 => charnames::string_vianame("U+05D0"),
    CODE => " 05D0",
    NAME => "alef",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
CHET => {
    UTF8 => charnames::string_vianame("U+05D7"),
    CODE => " 05D7",
    NAME => "chet",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
DALET => {
    UTF8 => charnames::string_vianame("U+05D3"),
    CODE => " 05D3",
    NAME => "dalet",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
GIMEL => {
    UTF8 => charnames::string_vianame("U+05D2"),
    CODE => " 05D2",
    NAME => "gimel",
    TYPE => LETTER,
    WIDTH => 11,
    HANDWRITING => 1,
},
GERESCH => {
    UTF8 => charnames::string_vianame("U+05F3"),
    CODE => " 05F3",
    NAME => "geresch",
    TYPE => LETTER,
    WIDTH => 9,
    HANDWRITING => 0,
},
GERSCHAYIM => {
    UTF8 => charnames::string_vianame("U+05F4"),
    CODE => " 05F4",
    NAME => "gerschayim",
    TYPE => LETTER,
    WIDTH => 14,
    HANDWRITING => 0,
},
HAEI => {
    UTF8 => charnames::string_vianame("U+05D4"),
    CODE => " 05D4",
    NAME => "häi",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
JOD => {
    UTF8 => charnames::string_vianame("U+05D9"),
    CODE => " 05D9",
    NAME => "jod",
    TYPE => LETTER,
    WIDTH => 10,
    HANDWRITING => 1,
},
KUF => {
    UTF8 => charnames::string_vianame("U+05E7"),
    CODE => " 05E7",
    NAME => "kuf",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
LAMED => {
    UTF8 => charnames::string_vianame("U+05DC"),
    CODE => " 05DC",
    NAME => "lamed",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 0,
},
RESCH => {
    UTF8 => charnames::string_vianame("U+05E8"),
    CODE => " 05E8",
    NAME => "resch",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 1,
},
SSAIN => {
    UTF8 => charnames::string_vianame("U+05D6"),
    CODE => " 05D6",
    NAME => "ssain",
    TYPE => LETTER,
    WIDTH => 9,
    HANDWRITING => 1,
},
SCHIN => {
    UTF8 => charnames::string_vianame("U+05E9"),
    CODE => " 05E9",
    NAME => "schin",
    TYPE => LETTER,
    WIDTH => 19,
    HANDWRITING => 1,
},
SSAMECH => {
    UTF8 => charnames::string_vianame("U+05E1"),
    CODE => " 05E1",
    NAME => "ssamech",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 1,
},
SPACE => {
    UTF8 => charnames::string_vianame("U+0020"),
    CODE => " 0020",
    NAME => "space",
    TYPE => LETTER,
    WIDTH => 10,
    HANDWRITING => 0,
},
NEWSPACE => {
    UTF8 => charnames::string_vianame("U+00A0"),
    CODE => " 00A0",
    NAME => "newspace",
    TYPE => LETTER,
    WIDTH => 10,
    HANDWRITING => 0,
},
TAW => {
    UTF8 => charnames::string_vianame("U+05EA"),
    CODE => " 05EA",
    NAME => "taw",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 1,
},
TET => {
    UTF8 => charnames::string_vianame("U+05D8"),
    CODE => " 05D8",
    NAME => "tet",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 1,
},
BET => {
    UTF8 => charnames::string_vianame("U+05D1"),
    CODE => " 05D1",
    NAME => "bet",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
WAW => {
    UTF8 => charnames::string_vianame("U+05D5"),
    CODE => " 05D5",
    NAME => "waw",
    TYPE => LETTER,
    WIDTH => 9,
    HANDWRITING => 1,
},
ZADI => {
    UTF8 => charnames::string_vianame("U+05E6"),
    CODE => " 05E6",
    NAME => "zadi",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
ZADISSOFIT => {
    UTF8 => charnames::string_vianame("U+05E5"),
    CODE => " 05E5",
    NAME => "zadissofit",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
KAF => {
    UTF8 => charnames::string_vianame("U+05DB"),
    CODE => " 05DB",
    NAME => "kaf",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 1,
},
CHAFSSOFIT => {
    UTF8 => charnames::string_vianame("U+05DA"),
    CODE => " 05DA",
    NAME => "chafssofit",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
PAEI => {
    UTF8 => charnames::string_vianame("U+05E4"),
    CODE => " 05E4",
    NAME => "päi",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 1,
},
FAEISSOFIT => {
    UTF8 => charnames::string_vianame("U+05E3"),
    CODE => " 05E3",
    NAME => "fäissofit",
    TYPE => LETTER,
    WIDTH => 18,
    HANDWRITING => 1,
},
MEM => {
    UTF8 => charnames::string_vianame("U+05DE"),
    CODE => " 05DE",
    NAME => "mem",
    TYPE => LETTER,
    WIDTH => 17,
    HANDWRITING => 1,
},
MEMSSOFIT => {
    UTF8 => charnames::string_vianame("U+05DD"),
    CODE => " 05DD",
    NAME => "memssofit",
    TYPE => LETTER,
    WIDTH => 16,
    HANDWRITING => 1,
},
NUN => {
    UTF8 => charnames::string_vianame("U+05E0"),
    CODE => " 05E0",
    NAME => "nun",
    TYPE => LETTER,
    WIDTH => 10,
    HANDWRITING => 1,
},
NUNSSOFIT => {
    UTF8 => charnames::string_vianame("U+05DF"),
    CODE => " 05DF",
    NAME => "nunssofit",
    TYPE => LETTER,
    WIDTH => 9,
    HANDWRITING => 1,
},
SHEVA => {
    UTF8 => charnames::string_vianame("U+05B0"),
    CODE => " 05B0",
    NAME => "schwa = e",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
HATAF_SEGOL=> {
    UTF8 => charnames::string_vianame("U+05B1"),
    CODE => " 05B1",
    NAME => "chataf szegol = e",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
HATAF_PATAH => {
    UTF8 => charnames::string_vianame("U+05B2"),
    CODE => " 05B2",
    NAME => "chataf patach = a",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
HATAF_QAMATS => {
    UTF8 => charnames::string_vianame("U+05B3"),
    CODE => " 05B3",
    NAME => "chataf kamatz = o",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
HIRIQ => {
    UTF8 => charnames::string_vianame("U+05B4"),
    CODE => " 05B4",
    NAME => "chirik = i",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
TSERE => {
    UTF8 => charnames::string_vianame("U+05B5"),
    CODE => " 05B5",
    NAME => "zeré = e",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
SEGOL => {
    UTF8 => charnames::string_vianame("U+05B6"),
    CODE => " 05B6",
    NAME => "szegol = e",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
PATAH => {
    UTF8 => charnames::string_vianame("U+05B7"),
    CODE => " 05B7",
    NAME => "patach = a",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
QAMATS => {
    UTF8 => charnames::string_vianame("U+05B8"),
    CODE => " 05B8",
    NAME => "kamatz = a",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
HOLAM => {
    UTF8 => charnames::string_vianame("U+05B9"),
    CODE => " 05B9",
    NAME => "cholam = o",
    TYPE => UPPER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
HOLAM_HASER => {
    UTF8 => charnames::string_vianame("U+05BA"),
    CODE => " 05BA",
    NAME => "cholam chaser",
    TYPE => UPPER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
QUBUTS => {
    UTF8 => charnames::string_vianame("U+05BB"),
    CODE => " 05BB",
    NAME => "kubutz = u",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
DAGESH => {
    UTF8 => charnames::string_vianame("U+05BC"),
    CODE => " 05BC",
    NAME => "dagesch / schuruk",
    TYPE => MIDDLE_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
METEG => {
    UTF8 => charnames::string_vianame("U+05BD"),
    CODE => " 05BD",
    NAME => "meteg",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
MAQAF => {
    UTF8 => charnames::string_vianame("U+05BE"),
    CODE => " 05BE",
    NAME => "makaf",
    TYPE => LETTER,
    WIDTH => 14,
    HANDWRITING => 0,
},
RAFE => {
    UTF8 => charnames::string_vianame("U+05BF"),
    CODE => " 05BF",
    NAME => "rafi",
    TYPE => UPPER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
PASEQ => {
    UTF8 => charnames::string_vianame("U+05C0"),
    CODE => " 05C0",
    NAME => "pasek",
    TYPE => LETTER,
    WIDTH => 4,
    HANDWRITING => 0,
},
SHIN_DOT => {
    UTF8 => charnames::string_vianame("U+05C1"),
    CODE => " 05C1",
    NAME => "schin Punkt",
    TYPE => UPPER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
SIN_DOT => {
    UTF8 => charnames::string_vianame("U+05C2"),
    CODE => " 05C2",
    NAME => "ssin Punkt",
    TYPE => UPPER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
SOF_PASUQ => {
    UTF8 => charnames::string_vianame("U+05C3"),
    CODE => " 05C3",
    NAME => "sof pasuk",
    TYPE => LETTER,
    WIDTH => 8,
    HANDWRITING => 0,
},
UPPER_DOT => {
    UTF8 => charnames::string_vianame("U+05C4"),
    CODE => " 05C4",
    NAME => "oberer Punkt",
    TYPE => UPPER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
LOWER_DOT => {
    UTF8 => charnames::string_vianame("U+05C5"),
    CODE => " 05C5",
    NAME => "unterer Punkt",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
HAFUKAH => {
    UTF8 => charnames::string_vianame("U+05C6"),
    CODE => " 05C6",
    NAME => "chafukach",
    TYPE => LETTER,
    WIDTH => 10,
    HANDWRITING => 0,
},
QAMATS_QATAN => {
    UTF8 => charnames::string_vianame("U+05C7"),
    CODE => " 05C7",
    NAME => "kamatz katan",
    TYPE => LOWER_PUNKTATION,
    WIDTH => 0,
    HANDWRITING => 0,
},
JIDDISH_DOUBLE_WAW => {
    UTF8 => charnames::string_vianame("U+05F0"),
    CODE => " 05F0",
    NAME => "waw waw",
    TYPE => LETTER,
    WIDTH => 19,
    HANDWRITING => 0,
},
JIDDISH_WAW_JOD => {
    UTF8 => charnames::string_vianame("U+05F1"),
    CODE => " 05F1",
    NAME => "waw jod",
    TYPE => LETTER,
    WIDTH => 20,
    HANDWRITING => 0,
},
JIDDISH_DOUBLE_JOD => {
    UTF8 => charnames::string_vianame("U+05F2"),
    CODE => " 05F2",
    NAME => "jod jod",
    TYPE => LETTER,
    WIDTH => 21,
    HANDWRITING => 0,
},
}};


# Accessor for the NIKUDLETTERS table.
# Returns the hash reference held by the constant itself (not a copy); any
# arguments, including a class-name invocant, are ignored.
sub get
{
    return NIKUDLETTERS();
}

# Concatenate the CODE field of every NIKUDLETTERS entry into one string,
# ordered by ascending code point.
#
# Takes no meaningful arguments (a class-name invocant is ignored).
# Returns a single string such as " 0020 00A0 05B0 ...".
sub get_regular_expression_string
{
    my $letters = NIKUDLETTERS;

    # CODE is hexadecimal text behind a one-character prefix. Comparing it
    # directly with <=> numifies it as decimal ("05E2" even reads as 5e2) and
    # triggers "isn't numeric" warnings, so convert with hex() first.
    my %value_of = map {
        $_ => hex( $letters->{$_}{CODE} =~ s/[^0-9A-Fa-f]//gr )
    } keys %{$letters};

    # The key name breaks ties so the result does not depend on hash order.
    my @keys_sorted = sort { $value_of{$a} <=> $value_of{$b} or $a cmp $b }
                      keys %value_of;

    return join('', map { $letters->{$_}{CODE} } @keys_sorted);
}

1;

I also defined some sign-letters the same way:

#!perl
package  Enums::Signletters;

use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();

our $VERSION = 1.0;

# SIGNLETTERS: table of the punctuation and symbol characters accepted in
# addition to the Hebrew characters, keyed by Unicode-style names. Each entry
# holds one field, CODE: four hexadecimal digits of the code point behind a
# one-character prefix. The comment above each entry shows the character.
use constant SIGNLETTERS => {

    # ?
    QUESTION_MARK =>
    {
        CODE => " 003F",
    },
    # !
    EXCLAMATION_MARK =>
    {
        CODE => " 0021",
    },
    # .
    FULL_STOP =>
    {
        CODE => " 002E",
    },
    # '
    APOSTROPHE =>
    {
        CODE => " 0027",
    },
    # (
    LEFT_PARENTHESIS =>
    {
        CODE => " 0028",
    },
    # )
    RIGHT_PARENTHESIS =>
    {
        CODE => " 0029",
    },
    # ,
    COMMA =>
    {
        CODE => " 002C",
    },
    # -
    HYPHEN_MINUS =>
    {
        CODE => " 002D",
    },
    # "
    QUOTATION_MARK =>
    {
        CODE => " 0022",
    },
    # §
    SECTION_SIGN =>
    {
        CODE => " 00A7",
    },
    # $
    DOLLAR_SIGN =>
    {
        CODE => " 0024",
    },
    # €
    EURO_SIGN =>
    {
        CODE => " 20AC",
    },
    # %
    PERCENT_SIGN =>
    {
        CODE => " 0025",
    },
    # /
    SOLIDUS =>
    {
        CODE => " 002F",
    },
    # [
    LEFT_SQUARE_BRACKET =>
    {
        CODE => " 005B",
    },
    # ]
    RIGHT_SQUARE_BRACKET =>
    {
        CODE => " 005D",
    },
    # {
    LEFT_CURLY_BRACKET =>
    {
        CODE => " 007B",
    },
    # }
    RIGHT_CURLY_BRACKET =>
    {
        CODE => " 007D",
    },
    # =
    EQUALS_SIGN =>
    {
        CODE => " 003D",
    },
    # \ (skipped by get_regular_expression_string)
    REVERSE_SOLIDUS =>
    {
        CODE => " 005C",
    },
    # *
    ASTERISK =>
    {
        CODE => " 002A",
    },
    # +
    PLUS_SIGN =>
    {
        CODE => " 002B",
    },
    # #
    NUMBER_SIGN =>
    {
        CODE => " 0023",
    },
    # ;  (U+003B; this entry used to repeat the NUMBER SIGN code 0023)
    SEMICOLON =>
    {
        CODE => " 003B",
    },
    # :
    COLON =>
    {
        CODE => " 003A",
    },
    # _
    LOW_LINE =>
    {
        CODE => " 005F",
    },
    # °
    DEGREE_SIGN =>
    {
        CODE => " 00B0",
    },
    # ^
    CIRCUMFLEX_ACCENT =>
    {
        CODE => " 005E",
    },
    # ´
    ACUTE_ACCENT =>
    {
        CODE => " 00B4",
    },
    # `
    GRAVE_ACCENT =>
    {
        CODE => " 0060",
    },
    # @
    COMMERCIAL_AT =>
    {
        CODE => " 0040",
    },
    # µ
    MICRO_SIGN =>
    {
        CODE => " 00B5",
    },
    # <
    LESS_THAN_SIGN =>
    {
        CODE => " 003C",
    },
    # >
    GREATER_THAN_SIGN =>
    {
        CODE => " 003E",
    },
    # |
    VERTICAL_LINE =>
    {
        CODE => " 007C",
    },
    # &
    AMPERSAND =>
    {
        CODE => " 0026",
    },
};

# Concatenate the CODE field of every SIGNLETTERS entry except
# REVERSE_SOLIDUS into one string, ordered by ascending code point.
#
# Takes no meaningful arguments (a class-name invocant is ignored).
# Returns a single string such as " 0021 0022 0023 ...".
sub get_regular_expression_string
{
    my $signs = SIGNLETTERS;

    # The backslash entry is deliberately left out of the result
    # (presumably because it is awkward inside a pattern - TODO confirm).
    my @wanted = grep { $_ ne "REVERSE_SOLIDUS" } keys %{$signs};

    # CODE is hexadecimal text behind a one-character prefix. Comparing it
    # directly with <=> numifies it as decimal and triggers "isn't numeric"
    # warnings, so convert with hex() first.
    my %value_of = map {
        $_ => hex( $signs->{$_}{CODE} =~ s/[^0-9A-Fa-f]//gr )
    } @wanted;

    # The key name breaks ties so the result does not depend on hash order.
    my @keys_sorted = sort { $value_of{$a} <=> $value_of{$b} or $a cmp $b }
                      @wanted;

    return join('', map { $signs->{$_}{CODE} } @keys_sorted);
}



1;

Then I can join the CODEs into any expression String I want. Like this:

#!perl
package Helpers::UnicodeChecker;

use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;

use Enums::Nikudletters;
use Enums::Signletters;

our $VERSION = 1.0;

 # Check that every character of $hebrew is listed in Enums::Nikudletters or
 # Enums::Signletters.
 #
 # Parameters:
 #   $self   - invocant (not used)
 #   $hebrew - character string to validate
 #
 # Returns the string "true" when all characters are listed (also for an
 # empty string), otherwise the string "false". Both strings are true values
 # in Perl, so callers have to compare the result with eq.
 sub is_valid_hebrew ($self, $hebrew)
 {
     # Both tables are consulted for every character: SPACE and NEWSPACE
     # (code points <= 255) are declared in Nikudletters and EURO_SIGN
     # (> 255) in Signletters, so choosing the table by code point range
     # rejected all three.
     my $allowed = join(' ',
         Enums::Nikudletters->get_regular_expression_string(),
         Enums::Signletters->get_regular_expression_string(),
     );

     foreach my $letter (split('', $hebrew))
     {
         # Same spelling as the CODE fields: at least four upper-case hex
         # digits. Codes above FFFF get five digits and therefore never match.
         my $code = sprintf('%04X', ord($letter));
         return "false" if index($allowed, $code) < 0;
     }
     return "true";
 }

# Format a code point as upper-case hexadecimal, zero-padded to four digits,
# i.e. the spelling used by the CODE fields (0x05E2 -> "05E2").
#
# Parameters: $_[0] - numeric code point.
# Returns the hex string. The conversion needs a real %04X directive;
# without it sprintf emitted its template unchanged and the result was "X}".
sub get_wide_code
{
    return sprintf('%04X', $_[0]);
}

# Format a code point below 256 as four upper-case hexadecimal digits,
# i.e. the spelling used by the CODE fields (0x3F -> "003F").
#
# Parameters: $_[0] - numeric code point in the range 0..255.
# Returns the hex string. The conversion needs a real %X directive;
# without it sprintf emitted its template unchanged and the result was "00X".
sub get_ascii_code
{
    return sprintf('%04X', $_[0]);
}

1;

This works but there is a lot of code involved. Would be nice to have a short regular expression to accomplish the same task. Can anyone provide the regular expression?

I googled a lot, read a lot and tried a lot, but I cannot find a regular expression that works with Perl 5.34.0. Thank you, any help is appreciated. I am just learning Perl.

CodePudding user response:

You can use a relatively simple pattern match to do this.

The interesting bit here is the \p{Hebrew}, which allows you to match every character with a specific Unicode property. The rest is just beginning ^ and end $ of string, and a quantifier to say one or more.

use strict;
use warnings;
use utf8;       # for the Hebrew glyphs in my example input

my $string = qq{שלום עולם!};
# The + after the character class means "one or more" of the allowed
# characters between the start (^) and the end ($) of the string.
print $string =~ m/^[\p{Hebrew}?!.' ]+$/;

This will match any Hebrew letters, spaces and a couple of punctuation characters. You do not need the utf8 pragma unless you want to include actual Hebrew text in your source code, such as in comments.

You can extend the character group (in []) by any other characters you want.

In my comment I used the re pragma, which is useful for debugging regular expressions.

  • Related