I am trying to search for repeating byte patterns in a very big binary file (always bigger than 4Gb). The idea is to read X bytes (patternSize) based on an increasing offset in each iteration and then search from offset patternSize the rest of the file for the same byte pattern. All of this using threads.
This is the code I've tried:
using System.Threading;
namespace pattern_search
{
public class ThreadData
{
public FileStream Stream { get; set; }
public byte[] SearchPattern { get; set; }
public int PatternOffset { get; set; }
public int PatternSize { get; set; }
public ThreadData(FileStream stream, byte[] searchPattern, int patternOffset, int patternSize)
{
Stream = stream;
SearchPattern = searchPattern;
PatternOffset = patternOffset;
PatternSize = patternSize;
}
}
internal class Program
{
static void CompareTask(Object stateInfo)
{
ThreadData threadData = (ThreadData)stateInfo;
long len = threadData.Stream.Length;
for (int searchOffset = threadData.PatternOffset threadData.PatternSize; searchOffset < (len - threadData.PatternSize); searchOffset )
{
byte[] comparePattern = new byte[threadData.PatternSize];
threadData.Stream.Seek(searchOffset, SeekOrigin.Begin);
int readBytes = threadData.Stream.Read(comparePattern, 0, threadData.PatternSize); // Randomly throws exception
if (comparePattern == threadData.SearchPattern)
{
Console.WriteLine("Pattern: {0} Search: {1}", threadData.PatternOffset, searchOffset);
}
}
}
static void Main(string[] args)
{
int patternSize = 8;
String inputFilename = @"F:\Dev\log\data.bin";
FileStream fs = new(inputFilename, FileMode.Open);
long len = fs.Length;
Console.WriteLine(len);
for (int patternOffset = 0; patternOffset < (len - patternSize); patternOffset )
{
byte[] searchPattern = new byte[patternSize];
fs.Seek(patternOffset, SeekOrigin.Begin);
int readBytes = fs.Read(searchPattern, 0, patternSize); // Randomly throws exception
ThreadData data = new ThreadData(fs, searchPattern, patternOffset, patternSize);
ThreadPool.QueueUserWorkItem(CompareTask, data);
}
}
}
}
Every time I run this, the following exception is thrown at random offsets in the lines where I read the bytes from the file and I don't know why:
System.ArgumentOutOfRangeException: 'Specified argument was out of the range of valid values.'
Does anyone see what I'm doing wrong?
EDIT
- Very big file > 4Gb
- Using locks makes it very slow
- Using one FileStream (with FileShare.Read) for each Thread solves the problem
CodePudding user response:
Stream are not thread safe, to fix this using current algorithm you should use lock...
static void CompareTask(Object stateInfo)
{
//...
int readBytes;
lock (threadData.Stream)
{
threadData.Stream.Seek(searchOffset, SeekOrigin.Begin);
readBytes = threadData.Stream.Read(comparePattern, 0, threadData.PatternSize); // Randomly throws exception
}
//...
}
static void Main(string[] args)
{
//...
int readBytes;
lock (fs)
{
fs.Seek(patternOffset, SeekOrigin.Begin);
readBytes = fs.Read(searchPattern, 0, patternSize); // Randomly throws exception
}
//..
}
of course it would not be efficient now, if possible, load the entire file into memory and do operation directly on the buffer, at not lock cost.