How to sort a large file >10Gb consisting of words?


I have been given this interview question:

Given an input file of words (possibly just random strings of letters) separated by commas; the file is 10 GB or larger.
Provide an algorithm to sort all these words.
Assume you have 1 GB RAM.
All this needs to be done natively using NodeJS.

My analysis:

  1. First of all, we need to read the file given to us;
    const readStream = fs.createReadStream('randomWords.csv', { highWaterMark: CHUNK_SIZE });

  2. Split it into sorted chunks
    (a word can be cut at a chunk boundary; the truncated word is carried over to the next chunk file);

  3. The name of each chunk file will be its first word. E.g.:

    Some chunk file: aaeknocvxg, aaocurnro, aaoudf, ..., zzqooe, zzwm.
    So the file name will be aaeknocvxg_0.csv.

    The _0 suffix is the chunk index (to avoid name collisions).

  4. Read the full list of chunk file names, sort it, and append the first word of the smallest-named chunk file to the final output file.

  5. Rename that chunk file after its next word and delete the word we just wrote from it. Repeat the process until all chunk files are empty.

  6. The resulting file is sorted!

Implementation:

const fs = require('fs');

const CHUNK_FOLDER_NAME = './chunks/';

const readStream = fs.createReadStream('randomWords.csv', { highWaterMark: 64 * 1024 });

const customSort = () => {
  let unprocessed = '';   // tail of the previous chunk that may be a truncated word
  let chunkName = '';
  let i = 0;

  readStream.on('data', chunk => {
    const chunkString = unprocessed + chunk.toString();

    const chunkArray = chunkString.split(', ');
    // The last entry may be cut off mid-word, so carry it over to the next chunk.
    unprocessed = chunkArray.pop();
    const sortedChunkArray = chunkArray.sort();

    // Name the chunk file after its first (smallest) word plus an index to avoid collisions.
    chunkName = `${CHUNK_FOLDER_NAME}${sortedChunkArray[0]}_${++i}.csv`;

    fs.writeFileSync(chunkName, sortedChunkArray.join(', '));
  });

  readStream.on('end', () => {
    if (unprocessed) {
      // Merge the leftover word into the last chunk file that was written.
      let lastChunk = fs.readFileSync(chunkName);

      const chunkString = lastChunk.toString();

      const chunkArray = chunkString.split(', ');
      chunkArray.push(unprocessed);

      const sortedChunkArray = chunkArray.sort();

      fs.unlinkSync(chunkName);

      fs.writeFileSync(
        `${CHUNK_FOLDER_NAME}${sortedChunkArray[0]}_${i}.csv`,
        sortedChunkArray.join(', '),
      );
    }

    let hasFiles = true;

    // Selection-style merge: repeatedly pick the chunk file with the smallest
    // name (i.e. the smallest first word), emit that word, then rename/rewrite the file.
    while (hasFiles) {
      let fileNameList = [];

      fs.readdirSync(CHUNK_FOLDER_NAME).forEach(file => {
        fileNameList.push(file);
      });

      if (fileNameList.length === 0) {
        hasFiles = false;
        break;
      }

      let smallestName = fileNameList.sort()[0];

      const [word, extension] = smallestName.split('_');

      fs.appendFileSync('sortedWords.csv', `${word}, `);

      let smallestData = fs.readFileSync(CHUNK_FOLDER_NAME + smallestName);

      fs.unlinkSync(CHUNK_FOLDER_NAME + smallestName);

      const smallestString = smallestData.toString();

      if (smallestString) {
        const smallestArray = smallestString.split(', ');

        const slicedSmallestArray = smallestArray.slice(1);

        if (slicedSmallestArray.length !== 0) {
          fs.writeFileSync(
            `${CHUNK_FOLDER_NAME}${slicedSmallestArray[0]}_${extension}`,
            slicedSmallestArray.join(', '),
          );
        }
      }
    }
  });
};

exports.customSort = customSort;



Question:

The algorithm is obviously not good; how would you do it using merge sort?

CodePudding user response:

A simple merge/(divide & conquer) sort solution would be:

  1. Sort smaller batches (e.g. 300 MB or even 1 GB) in internal memory with the algorithm of your choice and save each one to a file (a sorted batch).
  2. Merge two sorted batches into a larger one by splitting the RAM into three buffers (output, batch1, batch2). When the output buffer is full, append it to the external output file; when one of the batch buffers runs empty, read more from the corresponding external batch file. Repeat step 2 until only one single sorted file remains (see the sketch below).
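
The merge in step 2 can be done with plain Node streams. Below is a minimal sketch (my own, not part of the answer) that merges two sorted batch files line by line, so only the two current words are held in memory; the function name and the one-word-per-line format are assumptions:

const fs = require('fs');
const readline = require('readline');

// Merge two sorted batch files into outFile, reading each input one word (line) at a time.
// (A production version would also respect backpressure on `out`.)
async function mergeBatches(fileA, fileB, outFile) {
  const out = fs.createWriteStream(outFile);
  const itA = readline.createInterface({ input: fs.createReadStream(fileA) })[Symbol.asyncIterator]();
  const itB = readline.createInterface({ input: fs.createReadStream(fileB) })[Symbol.asyncIterator]();

  let a = await itA.next();
  let b = await itB.next();
  while (!a.done && !b.done) {
    if (a.value <= b.value) { out.write(a.value + '\n'); a = await itA.next(); }
    else                    { out.write(b.value + '\n'); b = await itB.next(); }
  }
  // Drain whichever input still has words left.
  while (!a.done) { out.write(a.value + '\n'); a = await itA.next(); }
  while (!b.done) { out.write(b.value + '\n'); b = await itB.next(); }
  out.end();
}

Repeatedly applying this to pairs of batch files halves the number of files per pass, which is the classic external merge sort.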

CodePudding user response:

First split the file into at most 26 files, one per first letter. Assuming each of these is then smaller than 1 GB, sort each of them and concatenate the results. For brevity we assume lowercase letters only.

const fs = require('fs');

const CHUNK_FOLDER_NAME = './chunks/';
const options = { highWaterMark: 10 };      // or any size
const readStream = fs.createReadStream('randomWords.csv', options);
const alpha = Array.from(Array(26)).map((e, i) => i + 65);
const alphabet = alpha.map((x) => String.fromCharCode(x).toLowerCase());

const customSort = () => {

    prepare();

    let last = '';

    readStream.on('data', chunk => {
        const chunkString = last + chunk.toString();
        const chunkArray = chunkString.split(',');
        last = chunkArray.pop();
        chunkArray.forEach(function (word) {
            word = word.trim();
            var letter = word.substring(0, 1);
            fs.appendFileSync(CHUNK_FOLDER_NAME + letter + "/letter-" + letter + ".csv", word + ",");
        })
    });

    readStream.on('end', () => {
        if (last) {
            var word = last;
            word = word.trim();         
            var letter = last.trim().substring(0, 1);
            fs.appendFileSync(CHUNK_FOLDER_NAME + letter + "/letter-" + letter + ".csv", word + ",");
        }

        // now sort every letter file (for clarity we sort it with \n as separator)
        alphabet.forEach(function (letter) {
            if (fs.existsSync(CHUNK_FOLDER_NAME + letter + "/letter-" + letter + '.csv')) {
                var content = fs.readFileSync(CHUNK_FOLDER_NAME + letter + "/letter-" + letter + '.csv');
                content = content.toString().split(",").sort().join("\n").trim() + (letter != 'z' ? "\n" : "")
                fs.appendFileSync("./final-sorted.txt", content);
            }
        })

        console.log("done")
    });
};


function prepare() {
    // create empty a,b,c folders
    alphabet.forEach(function (letter) {
        if (!fs.existsSync(CHUNK_FOLDER_NAME + letter)) {
            fs.mkdirSync(CHUNK_FOLDER_NAME + letter)
        }
        if (fs.existsSync(CHUNK_FOLDER_NAME + letter + "/letter-" + letter + '.csv')) {
            fs.unlinkSync(CHUNK_FOLDER_NAME + letter + "/letter-" + letter + '.csv');
        }
        }
    })
    if (fs.existsSync("./final-sorted.txt")) {
        fs.unlinkSync("./final-sorted.txt");
    }
}

customSort();

CodePudding user response:

  1. Create 10 chunks of 1GB each. Sort each chunk individually using merge sort (or any other algorithm for that matter).
  2. Create a pointer to each chunk, to have a total of 10 pointers. Create a pointer which points to the output file.
  3. Pick the pointer that points at the minimum value/string. Write that value through the output pointer, and advance that source pointer.
  4. Continue this process until all chunks are exhausted.

This won't exceed the memory limit at any point, since the most data we hold in memory at any single point in time is one chunk, i.e. 1 GB in this case. A sketch of the k-way merge step follows.
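
Here is a minimal sketch of steps 2-4 (mine, not the answerer's), using one readline iterator per sorted chunk file as the "pointer"; the function name, chunk file names, and a one-word-per-line format are assumptions:

const fs = require('fs');
const readline = require('readline');

// k-way merge: one "pointer" (async line iterator) per sorted chunk file.
async function kWayMerge(chunkFiles, outFile) {
  const out = fs.createWriteStream(outFile);
  const pointers = chunkFiles.map(f =>
    readline.createInterface({ input: fs.createReadStream(f) })[Symbol.asyncIterator]()
  );
  // Current word under each pointer.
  const heads = await Promise.all(pointers.map(p => p.next()));

  while (heads.some(h => !h.done)) {
    // Find the pointer whose current word is the smallest.
    let min = -1;
    for (let i = 0; i < heads.length; i++) {
      if (!heads[i].done && (min === -1 || heads[i].value < heads[min].value)) min = i;
    }
    out.write(heads[min].value + '\n');        // emit the minimum
    heads[min] = await pointers[min].next();   // advance that pointer
  }
  out.end();
}

// e.g. kWayMerge(['chunk_0.csv', 'chunk_1.csv'], 'sortedWords.csv');

With a heap instead of the linear scan this scales to many chunks, but for 10 pointers the scan is negligible.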

CodePudding user response:

Here is pseudocode for an approach I've used for a harder variation of this problem in real life.

First create a wordBuffer class which supports the following API:

// Construct it.
wb = new WordBuffer()
// Tell it to be backed by a file, instead of an array.
// If the file is nonempty, it will assume finished writing and that
// the size is the size of the file.
wb.setFileName(file)
// Add a word.  Can only call if haven't finished writing.
wb.add(word)
// Mark done writing.
wb.finishWriting()
// How much is in it?
wb.size
// What file is it backed by?  May be undefined.
wb.filename
// Gets the next word and advances.  May give undefined.
wb.getWord()
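
For concreteness, here is one minimal, synchronous way such a WordBuffer could look (my own simplification, not the answerer's code: a real version would stream its backing file instead of loading it whole, and size could just as well be bytes rather than a word count):

const fs = require('fs');

class WordBuffer {
  constructor() {
    this.words = [];            // in-memory backing by default
    this.pos = 0;
    this.filename = undefined;
    this.writing = true;
  }
  setFileName(file) {
    this.filename = file;
    if (fs.existsSync(file) && fs.statSync(file).size > 0) {
      // Non-empty backing file: treat it as already fully written.
      this.words = fs.readFileSync(file, 'utf8').split('\n').filter(Boolean);
      this.writing = false;
    }
  }
  add(word) {
    if (!this.writing) throw new Error('already finished writing');
    this.words.push(word);
  }
  finishWriting() {
    this.writing = false;
    if (this.filename) fs.writeFileSync(this.filename, this.words.join('\n'));
  }
  get size() { return this.words.length; }       // word count here; bytes would also work
  getWord() { return this.words[this.pos++]; }   // undefined once exhausted
}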

With these we can easily define a merge function as follows.

// Note: `counter` is assumed to be a module-level integer used to name spill files.
function merge (in1, in2) {
    let out = new WordBuffer();
    // Large merges are backed by a file on disk; small ones stay in memory.
    if (in1.size + in2.size > 10000000) {
        out.setFileName(`buffer/${counter}.buf`);
        counter++;
    }
    let word1 = in1.getWord();
    let word2 = in2.getWord();
    while ((word1 != undefined) && (word2 != undefined)) {
        if (word1 < word2) {
            out.add(word1);
            word1 = in1.getWord();
        }
        else {
            out.add(word2);
            word2 = in2.getWord();
        }
    }
    while (word1 != undefined) {
        out.add(word1);
        word1 = in1.getWord();
    }
    while (word2 != undefined) {
        out.add(word2);
        word2 = in2.getWord();
    }
    out.finishWriting();
    return out;
}

And now for the cascading merge logic.

let chunks = [];
let inWB = new WordBuffer();
inWB.setFileName(inputFile);
let word = inWB.getWord();
while (word != undefined) {
    let tmpWB = new WordBuffer();
    tmpWB.add(word);
    tmpWB.finishWriting();
    chunks.push(tmpWB);
    // Keep merging the two most recent chunks while they are of comparable size.
    while ((1 < chunks.length) && (chunks[chunks.length - 2].size * 0.8 < chunks[chunks.length - 1].size)) {
        chunks.push(merge(chunks.pop(), chunks.pop()));
    }
    word = inWB.getWord();   // advance to the next input word
}
while (1 < chunks.length) {
    chunks.push(merge(chunks.pop(), chunks.pop()));
}

// And now we can copy from chunks[0] - either read and write,
// or just move the underlying file to where we want the sorted output.

Note that when I solved this, I found that keeping the temporary files compressed was worth it: the CPU overhead was less than what I gained back in file I/O. That was some years ago though, so YMMV. I also didn't write this to be async; that would take some thought. And finally, replacing the first few levels of merge sort with the built-in library sort would probably be a significant performance boost. (It was for me.) A sketch of the compression idea is below.
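
For the compression point, a minimal sketch (mine, with made-up function names) of wrapping chunk files in gzip via Node's zlib:

const fs = require('fs');
const zlib = require('zlib');
const { Readable, pipeline } = require('stream');

// Write a sorted chunk compressed: less disk I/O at the cost of some CPU.
function writeCompressedChunk(words, file, done) {
  pipeline(
    Readable.from([words.join('\n')]),
    zlib.createGzip(),
    fs.createWriteStream(file),
    done
  );
}

// Read a compressed chunk back as a plain text stream for merging.
function readCompressedChunk(file) {
  return fs.createReadStream(file).pipe(zlib.createGunzip());
}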
