I'm trying to convert a Python algorithm from this Stack Overflow answer to split a string without spaces into words to C#.
Unfortunately I don't know anything about Python so the translation is proving very difficult.
The lines I don't understand are:
wordcost = dict((k, log((i 1)*log(len(words)))) for i,k in enumerate(words)) <= THIS LINE
and
def best_match(i):
candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
return min((c wordcost.get(s[i-k-1:i], 9e999), k 1) for k,c in candidates) <= THIS LINE
It looks as though best_match(i)
it should return a Tuple<>
. Could someone please explain what is the equivalent in C#?
Here is the full Python script:
from math import log
# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
words = open("words-by-frequency.txt").read().split()
wordcost = dict((k, log((i 1)*log(len(words)))) for i,k in enumerate(words))
maxword = max(len(x) for x in words)
def infer_spaces(s):
"""Uses dynamic programming to infer the location of spaces in a string
without spaces."""
# Find the best match for the i first characters, assuming cost has
# been built for the i-1 first characters.
# Returns a pair (match_cost, match_length).
def best_match(i):
candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
return min((c wordcost.get(s[i-k-1:i], 9e999), k 1) for k,c in candidates)
# Build the cost array.
cost = [0]
for i in range(1,len(s) 1):
c,k = best_match(i)
cost.append(c)
# Backtrack to recover the minimal-cost string.
out = []
i = len(s)
while i>0:
c,k = best_match(i)
assert c == cost[i]
out.append(s[i-k:i])
i -= k
return " ".join(reversed(out))
CodePudding user response:
I found that algorithm interesting so here is my translation:
class WordSplitter {
private readonly Dictionary<string, double> _wordCosts;
private readonly int _maxWordLength;
public WordSplitter(string freqFilePath) {
// words = open("words-by-frequency.txt").read().split()
var words = File.ReadAllLines(freqFilePath);
// wordcost = dict((k, log((i 1)*log(len(words)))) for i,k in enumerate(words))
_wordCosts = words.Select((k, i) => new { Key = k, Value = Math.Log((i 1) * Math.Log(words.Length)) }).ToDictionary(c => c.Key, c => c.Value);
// maxword = max(len(x) for x in words)
_maxWordLength = words.Select(c => c.Length).Max();
}
public string InferSpaces(string target) {
// cost = [0]
var costs = new List<double>() { 0 };
foreach (var i in Enumerable.Range(1, target.Length)) {
var (c, k) = BestMatch(i);
costs.Add(c);
}
var output = new List<string>();
int len = target.Length;
while (len > 0) {
var (c, k) = BestMatch(len);
Debug.Assert(k > 0);
Debug.Assert(c == costs[len]);
// use Substring if your compiler version doesn't support slicing
// but pay attention that Substring second argument is length, not end index
output.Add(target[(len - k)..len]);
len -= k;
}
output.Reverse();
return String.Join(" ", output);
(double cost, int length) BestMatch(int i) {
var start = Math.Max(0, i - _maxWordLength);
// GetRange second argument is length
var x = costs.GetRange(start, i - start);
x.Reverse();
// now, this part is easier to comprehend if it's expanded a bit
// you can do it in cryptic way too like in python though if you like
(double cost, int length)? result = null;
for (int k = 0; k < x.Count; k ) {
var c = x[k];
var sub = target[(i - k - 1)..i];
var cost = c (_wordCosts.ContainsKey(sub) ? _wordCosts[sub] : 9e99); // 9e99 is just some big number. 9e999 is outside of double range in C#, so use smaller one
// save minimal cost
if (result == null || result.Value.cost > cost)
result = (cost, k 1);
}
// return minimal cost
return result.Value;
}
}
}
Usage:
var splitter = new WordSplitter(@"C:\tmp\words.txt");
var result = splitter.InferSpaces("thumbgreenappleactiveassignmentweeklymetaphor");