Home > Software design >  Group adjacent similar (shortest prefix) elements in string or list in Scala
Group adjacent similar (shortest prefix) elements in string or list in Scala

Time:07-28

am facing dramatic performances issues for a kind of classic problem to solve.

i have as inputs a string like :

input :
  val test: String = "1,1,12,1,1,12,13"
  val test2: String = "1,1,12,1,1,12,13,1,1,12"

Desired output:

test --> 2{1,1,12},13

test2 --> 2{1,1,12},13,2{1},12

I have took the following approach, by comparing suffix, and prefix of that given string in a recursive fashion

  def shorten_prefix(pfx:String, sfx: String): (Integer, String, String) = {
    var count = 1
    var _sfx = sfx
    while (_sfx.startsWith(pfx)) {
      count = count   1
      _sfx = _sfx.splitAt(pfx.length)._2
    }
    val _tmp =  s"$count{${pfx}}"
    (count, _tmp, _sfx)
  }
  def find_shortest_repr(input: String): String= {
    var possible_variants: mutable.ListBuffer[String] = new mutable.ListBuffer[String]()
    if (input.isEmpty) {
      return ""
    }

        val size = input.length
        for (i <- 1 until  size   1){
          val (prefixes, suffixes) = input.splitAt(i)
          val (counter, shortened, new_ending) = shorten_prefix(prefixes, suffixes)

          val shortest_ending: String = find_shortest_repr(new_ending)
          val tmp = shortened    shortest_ending
          possible_variants  = (tmp)
        }
        possible_variants.minBy(f => f.length)
      }


   def compress(x: String)= {
     val _tmp = find_shortest_repr(x)
     val regex = "(,\\})"
     val subst = "},"
     _tmp.replaceAll(regex, subst).dropRight(1)
   }

  println(compress(test))
  println(compress(test2))

}

It sounds that the longer is the string, longer is the calculation of all possible permutations, to find the best match.

Is there any tricks or advice, am missing ?

Thanks you

CodePudding user response:

Here's some code. It is functional and tail recursive, but no idea of performance or whether there are better algorithms!

// Count number of times first element repeats at start of list
// Returns repeat count
def prefixCount[T](s: List[T]): Int = {
  def loop(first: T, rem: List[T], matchCount: Int): Int =
    rem match {
      case hd :: tail if hd == first =>
        loop(first, tail, matchCount   1)
      case _ =>
        matchCount
    }

  s match {
    case hd :: tail => loop(hd, tail, 1)
    case _ => s.size
  }
}

// Find the largest repeating group at start of sequence
// Returns (groupSize, groupCount)
def prefixGroup[T](seq: Seq[T]): (Int, Int) = {
  def loop(len: Int): (Int, Int) =
    if (len == 0) {
      (1, 1)
    } else {
      val g = seq.grouped(len).toList
      val c = prefixCount(g)
      if (c > 1) {
        (len, c)
      } else {
        loop(len-1)
      }
    }

  seq.size match {
    case 0 => (0,1)
    case n => loop(n/2)
  }
}

// Find all prefix sequences
def prefixAll[T](v: Seq[T]): List[(Seq[T], Int)] = {
  def loop(rem: Seq[T], res: List[(Seq[T], Int)]): List[(Seq[T], Int)] =
    rem match {
      case Nil =>
        res.reverse
      case hd :: Nil =>
        ((rem, 1)  : res).reverse
      case _ =>
        val (groupSize, groupCount) = prefixGroup(rem)

        loop(rem.drop(groupSize*groupCount), (rem.take(groupSize), groupCount)  : res)
    }

  loop(v, Nil)
}

def prefixToString[T](p: List[(Seq[T], Int)]): String =
  p.map {
    case (s, 1) => s.mkString(",")
    case (s, l) => l.toString   "{"   s.mkString(",")   "}"
  }.mkString(",")

def test(s: String) = {
  val v = s.split(",").toVector
  val a = prefixAll(v)

  print(s"$s => ")
  println(prefixToString(a))
}

val tests = List(
  "",
  "1",
  "1,2,3,4,5,6",
  "1,1,12,1,1,12,13",
  "1,1,12,1,1,12,13,1,1,12",
  "2,1,2,2,1,2,1,2,2",
)

tests.foreach(test)

Output is:

  => 
1 => 1
1,2,3,4,5,6 => 1,2,3,4,5,6
1,1,12,1,1,12,13 => 2{1,1,12},13
1,1,12,1,1,12,13,1,1,12 => 2{1,1,12},13,2{1},12
2,1,2,2,1,2,1,2,2 => 2{2,1,2},1,2{2}

CodePudding user response:

Some ideas (some of which also can be found in Tim's solution):

  1. Split the input strings before launching the actual algorithm (assuming inputs will always be comma-separated).
  2. There is no need to search for further occurrences of a prefix that is longer than the remaining string (e.g. "1,2,3" in "1,2,3,3,3").
  3. Instead of starting your search with the shortest possible prefix (of size 1) and then continuing with longer prefixes, you might want to start with the longest possible prefix (of size input.length / 2) and then decrement. That way, you can "fast forward" as soon as you found a match.

CodePudding user response:

Here is a fp solution, you can modify it via dp method to a O(n^3) time and O(n^2) space complexity one:

Note: the recurrence relation of this problem is compress(ss, start, end) = minimize { compress(ss, start, i) compress(ss, i, end) for i in 1, 2, .., len(ss) -1}

So there are n^n program state

def compress(text: String): String = {
  def size: ((Seq[String], Int)) => Int = {
    case (ss, i) => if(i == 1) ss.map(_.size).sum   ss.size - 1 
                    else ss.map(_.size).sum   ss.size   i.toString.size   1
  }
  
  def length(res: Seq[(Seq[String], Int)]): Int = res.map(size).sum   res.size - 1

  def cmpOne(items: Seq[String]): (Seq[String], Int) = 
    (1 to items.size).collectFirst { 
      case i if items.size % i == 0 &&  items.grouped(i).toSet.size == 1 => (items.take(i), items.size / i)
    }.get
    
  def cmpAll(items: Seq[String]): Seq[(Seq[String], Int)] =  {
    val totalRes = Seq(cmpOne(items))
    if (items.size >= 2) {
      val totalLen = length(totalRes)
      val splitRes = (1 until items.size).map(i => cmpAll(items.take(i))    cmpAll(items.drop(i))).minBy(length)
      val splitLen = length(splitRes)
      if (splitLen < totalLen) splitRes else totalRes
    }
    else totalRes
  }
  
  cmpAll(text.split(",")).map{ case(ss, i) => if(i == 1) ss.mkString(",") else i.toString   "{"   ss.mkString(",")   "}" }.mkString(",")
}
  • Related