Home > Software engineering >  Extract all the matched groups of a regular expression in a string
Extract all the matched groups of a regular expression in a string

Time:04-08

Is there a way to extract, in one call, all the matched subgroups of a string according to a regular expression?

I have a date like this:

Thu, 07 Apr 2022 15:03:32 GMT

And I created the following regexp to extract all the parts of this date:

(* Matches dates such as "Thu, 07 Apr 2022 15:03:32 GMT".
   Groups 1 to 7 capture weekday, day, month, year, hour, minute and second.
   Each group uses the [+] quantifier so that it spans the whole field
   rather than a single character. *)
let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}

And to extract each part I use it like this:


(* [parse_date date] prints the seven fields of [date] (weekday, day, month,
   year, hour, minute, second) on stderr.

   Each field is obtained by replacing the whole match with one capture
   group. The trailing [.*] makes the match cover the rest of the string, so
   only the group text remains. If [date] does not match, [Str.replace_first]
   returns it unchanged and every field is the full input. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  let wday = Str.replace_first re {|\1|} date in
  let day = Str.replace_first re {|\2|} date in
  let mon = Str.replace_first re {|\3|} date in
  let year = Str.replace_first re {|\4|} date in
  let hour = Str.replace_first re {|\5|} date in
  let min = Str.replace_first re {|\6|} date in
  let sec = Str.replace_first re {|\7|} date in
  Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
    sec

If the parts were stored in an array I could easily use it like this:


(* Wished-for version of [parse_date]: fetch every capture group in one call
   as an array, then index it. Index 0 is left unused here; groups 1 to 7 are
   weekday, day, month, year, hour, minute and second. *)
let parse_date date =
  let re =
    Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|}
  in
  let parts = Str.match_groups re date in (* this function doesn't exist *)
  let wday = parts.(1) in
  let day = parts.(2) in
  let mon = parts.(3) in
  let year = parts.(4) in
  let hour = parts.(5) in
  let min = parts.(6) in
  let sec = parts.(7) in
  Format.eprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min
    sec

but this doesn't appear to exist. Is there another way to do it or is my solution the only one available?

Since this isn't an XY problem, my goal is really to extract each part of a date, so maybe there's another solution than using Str and I'll be happy to use it.

CodePudding user response:

You can use Str.matched_group to return a particular capture group's match:

(* [parse_date date] matches [date] against the date pattern, anchored at
   position 0, and returns a string listing its seven fields (weekday, day,
   month, year, hour, minute, second). It returns "RE DATE: Not matched" when
   [date] does not match.

   [Str.matched_group] reads the state left by the last successful match, so
   it is only called inside the branch where [Str.string_match] succeeded. *)
let parse_date date =
  let re = Str.regexp
      {|\([a-zA-Z]+\), \([0-9]+\) \([a-zA-Z]+\) \([0-9]+\) \([0-9]+\):\([0-9]+\):\([0-9]+\).*|} in
  if Str.string_match re date 0 then
    let wday = Str.matched_group 1 date in
    let day = Str.matched_group 2 date in
    let mon = Str.matched_group 3 date in
    let year = Str.matched_group 4 date in
    let hour = Str.matched_group 5 date in
    let min = Str.matched_group 6 date in
    let sec = Str.matched_group 7 date in
    Format.sprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min sec
  else
    "RE DATE: Not matched"

(* Demo: format the sample date and print the result on stdout. *)
let () = print_endline (parse_date "Thu, 07 Apr 2022 15:03:32 GMT")

The Str package is pretty primitive, though. I'd suggest using a different library for regular expressions, like PCRE-Ocaml. It does have a way to get an array of matched groups:

(* [parse_date2 date] is the PCRE counterpart of [parse_date]: it returns a
   string listing the seven fields of [date] (weekday, day, month, year, hour,
   minute, second), or "RE DATE: Not matched" when the pattern does not match.

   [Pcre.get_substrings] returns all groups as an array. Index 0 is the whole
   match and indices 1 to 7 are the capture groups. A failed [Pcre.exec]
   raises [Not_found], which is handled below. *)
let parse_date2 date =
  let rex = Pcre.regexp
      {|([a-zA-Z]+), ([0-9]+) ([a-zA-Z]+) ([0-9]+) ([0-9]+):([0-9]+):([0-9]+).*|} in
  try
    let parts = Pcre.exec ~rex date |> Pcre.get_substrings in
    let wday = parts.(1) in
    let day = parts.(2) in
    let mon = parts.(3) in
    let year = parts.(4) in
    let hour = parts.(5) in
    let min = parts.(6) in
    let sec = parts.(7) in
    Format.sprintf "RE DATE: %s %s %s %s %s %s %s@." wday day mon year hour min sec
  with Not_found -> "RE DATE: Not matched"

(* Demo: format the sample date with the PCRE version and print it on stdout. *)
let () = print_endline (parse_date2 "Thu, 07 Apr 2022 15:03:32 GMT")

CodePudding user response:

For a simple format with a fixed number of fields and separators, Scanf might be enough:

(* [date s] parses a date such as "Thu, 07 Apr 2022 15:03:32 GMT" and returns
   the tuple (day_name, day, month, year, hour, minute, second, timezone).

   [%s@,] reads the day name up to the comma and skips the comma. The format
   needs eight conversions, one for each argument of the callback.
   [Scanf.sscanf] raises an exception (for instance [Scanf.Scan_failure]) when
   the input does not fit the format. *)
let date s = Scanf.sscanf s "%s@, %d %s %d %d:%d:%d %s"
  (fun day_name day month year h m s timezone ->
    day_name,day,month,year,h,m,s,timezone
  )

(* Demo: parse the sample date into its tuple of fields. *)
let x = "Thu, 07 Apr 2022 15:03:32 GMT" |> date
  • Related