I have two different text files and I have to find the 10 longest words that appear in both of them. I have to print the list of those words and write their frequency - how many times each word occurs in each file separately. The problem with my current code is that it finds the words, but when it comes to frequency it combines the counts from both files into a single number. How can I change the code to get the frequency count for each file separately?
Here is my code for finding words that are in both text files:
/// <summary>
/// Finds the words that occur in both texts and counts how often they occur overall.
/// </summary>
/// <param name="data1">Full text of the first book.</param>
/// <param name="data2">Full text of the second book.</param>
/// <param name="punctuation">Separator characters used to split both texts into words.</param>
/// <returns>
/// A dictionary keyed by every word found in both texts (comparison is case-sensitive).
/// Each value is the COMBINED count: occurrences in <paramref name="data1"/> plus
/// occurrences in <paramref name="data2"/>. Per-file counts are not kept separately.
/// </returns>
public static Dictionary<string, int> PopularWords(string data1, string data2, char[] punctuation)
{
    string[] book1 = data1.Split(punctuation, StringSplitOptions.RemoveEmptyEntries);
    string[] book2 = data2.Split(punctuation, StringSplitOptions.RemoveEmptyEntries);

    // Count the second book once up front instead of rescanning it for every word of the first.
    Dictionary<string, int> book2Counts = new Dictionary<string, int>();
    foreach (string word in book2)
    {
        book2Counts.TryGetValue(word, out int seen);
        book2Counts[word] = seen + 1;
    }

    Dictionary<string, int> matches = new Dictionary<string, int>();
    foreach (string word in book1)
    {
        if (matches.TryGetValue(word, out int total))
        {
            // Already known to be shared: this is one more occurrence in the first book.
            matches[word] = total + 1;
        }
        else if (book2Counts.TryGetValue(word, out int inBook2))
        {
            // First time this shared word is seen: this occurrence plus all of the second book's.
            matches.Add(word, 1 + inBook2);
        }
    }

    return matches;
}
And here is my code for reading and printing:
/// <summary>
/// Reads two text files, collects the words they share via <c>PopularWords</c>,
/// orders them by word length (longest first) and starts the report file by
/// writing its table header. Writing the table rows is not implemented yet.
/// </summary>
/// <param name="data">Path of the first text file (read as UTF-8).</param>
/// <param name="data1">Path of the second text file (read as UTF-8).</param>
/// <param name="results">Path of the report file to create.</param>
public static void ProcessPopular(string data, string data1, string results)
{
    const string rowFormat = "{0, -25} | {1, -35} | {2, -35}";
    char[] punctuation = { ' ', '.', ',', '!', '?', ':', ';', '(', ')', '\n' };

    string firstText = File.ReadAllText(data, Encoding.UTF8);
    string secondText = File.ReadAllText(data1, Encoding.UTF8);

    // Shared words, longest first.
    KeyValuePair<string, int>[] popularWords = PopularWords(firstText, secondText, punctuation).ToArray();
    Array.Sort(popularWords, (left, right) => right.Key.Length.CompareTo(left.Key.Length));

    using (StreamWriter writerF = File.CreateText(results))
    {
        // Placeholder counter for the row-writing loop that has not been written yet.
        int foundWords = 0;

        writerF.WriteLine(rowFormat, "Longest words", "Frequency in 1 .txt file", "Frequency in 2 .txt file");
        writerF.WriteLine(new string('-', 101));
        // not finished
    }
}
CodePudding user response:
To simplify, if performance is not the key here, I would go this way:
/// <summary>
/// Demo: takes two hard-coded sample texts, finds the words they have in common
/// (ignoring case) and prints, for each shared word, how many times it occurs
/// in each text.
/// </summary>
public static void Method()
{
    const string textOne = "A deep blue raffle, very deep and blue, raffle raffle. An old one was there";
    const string textTwo = "deep blue raffle, very very very long and blue, raffle RAFFLE. A new one was there";
    char[] punctuation = { '.', ',', '!', '?', ':', ';', '(', ')', '\n' };

    // Strip the punctuation characters first, then split on single spaces.
    string[] ToWords(string text)
    {
        char[] kept = text.Where(ch => !punctuation.Contains(ch)).ToArray();
        return new string(kept).Split(" ");
    }

    // Case-insensitive occurrence count of one word inside a word list.
    int CountIn(string[] words, string target)
    {
        return words.Count(w => w.Equals(target, StringComparison.OrdinalIgnoreCase));
    }

    string[] wordsOne = ToWords(textOne);
    string[] wordsTwo = ToWords(textTwo);

    // Shared words are reported in the order (and casing) of their first appearance in the first text.
    var rows = wordsOne
        .Intersect(wordsTwo, StringComparer.OrdinalIgnoreCase)
        .Select(word => (InOne: CountIn(wordsOne, word), InTwo: CountIn(wordsTwo, word), Word: word))
        .ToList();

    foreach (var row in rows)
    {
        Output.WriteLine($"Word: {row.Word} | In file one: {row.InOne} | In file two: {row.InTwo}");
    }
}
This will give you the result of
Word: A | In file one: 1 | In file two: 1
Word: deep | In file one: 2 | In file two: 1
Word: blue | In file one: 2 | In file two: 2
Word: raffle | In file one: 3 | In file two: 3
Word: very | In file one: 1 | In file two: 3
Word: and | In file one: 1 | In file two: 1
Word: one | In file one: 1 | In file two: 1
Word: was | In file one: 1 | In file two: 1
Word: there | In file one: 1 | In file two: 1
Other requirements, like only 10 words or minimum word length etc can be easily applied.
Please keep in mind that this is a bare-bones example, without any safety checks. It also omits reading the data from files.
CodePudding user response:
In case you would be curious about a different coding style, here is a solution using Linq.
This solution is more general: it works for any number of files you wish to process. It is a very raw query that could be split into smaller queries, but it shows the underlying logic.
This is a Linqpad query that you can run directly via copy/paste, but you need to provide the text files of course:
// Finds the longest words that appear in every listed file and prints, for each
// of them, how many times it occurs in each file.

// Choose here how many different words you want.
var resultCount = 10;

// Add as many files as needed.
var Files = new List<string>
{
    @"C:\Temp\FileA.txt",
    @"C:\Temp\FileB.txt",
    @"C:\Temp\FileC.txt",
};

// Perform the calculation.
var LongestCommonWords = Files
    .Select(f => new // Get the text from the file alongside the file name.
    {
        FileName = f,
        Text = File.ReadAllText(f)
    })
    .Select(f => new
    {   // Add the punctuation marks found in the text (used below to trim them off the words).
        f.FileName,
        f.Text,
        Punctuation = f.Text
            .Where(char.IsPunctuation)
            .Distinct()
            .ToArray()
    })
    .SelectMany(f => f.Text.Split()             // Split each text into whitespace-separated tokens.
        .Select(w => w.Trim(f.Punctuation))     // Remove leading/trailing punctuation.
        .Where(w => w.Length > 0)               // Drop empty tokens (repeated whitespace, punctuation-only tokens).
        .Select(w => w.ToLowerInvariant())      // Culture-independent key so grouping does not depend on the machine locale.
        .GroupBy(w => w)                        // Group by word within this file.
        .Select(g => new                        // One entry per (word, file) with its occurrence count.
        {
            Word = g.Key,
            FileOccurrence = g.Count(),
            f.FileName
        })
    )   // Because of SelectMany, the entries of all files are now in a single sequence.
    .GroupBy(w => w.Word) // Group by word across files.
    .Select(g => new      // For each word: the occurrence count per file and the files where it was found.
    {
        Word = g.Key,
        OccurrencesByFile = g
            .Select(e => new
            {
                e.FileOccurrence,
                e.FileName
            })
            .ToList(),
        FileNames = g
            .Select(e => e.FileName)
            .Distinct()
            .ToList()
    })
    .Where(w => w.FileNames.Count == Files.Count) // Only keep words that were found in all files.
    .OrderByDescending(w => w.Word.Length)        // Order by length, longest first.
    .Take(resultCount);                           // Take only the desired amount (10 for instance).

// Display the results.
foreach (var word in LongestCommonWords)
{
    var occurrences = string.Join(
        " / ",
        word.OccurrencesByFile
            .Select(f => $"{Path.GetFileName(f.FileName)} - {f.FileOccurrence}")
    );
    Console.WriteLine($"{word.Word} - {occurrences}");
}
Here is an output obtained with the content of three Wikipedia pages:
contribution - FileA.txt - 9 / FileB.txt - 1 / FileC.txt - 5
subsequently - FileA.txt - 2 / FileB.txt - 1 / FileC.txt - 1
introduction - FileA.txt - 1 / FileB.txt - 4 / FileC.txt - 3
alternative - FileA.txt - 2 / FileB.txt - 1 / FileC.txt - 1
independent - FileA.txt - 5 / FileB.txt - 3 / FileC.txt - 3
significant - FileA.txt - 2 / FileB.txt - 1 / FileC.txt - 3
established - FileA.txt - 1 / FileB.txt - 1 / FileC.txt - 1
outstanding - FileA.txt - 1 / FileB.txt - 3 / FileC.txt - 3
programming - FileA.txt - 1 / FileB.txt - 2 / FileC.txt - 4
university - FileA.txt - 44 / FileB.txt - 17 / FileC.txt - 7