In the following:
cat file | cut -f 1,5,6 | sort | uniq
It is intuitive to me to think that uniq
needs to know the whole dataset before proceeding.
From HERE I understand that sort
does write temporary files into disk for long sets of data.
Does uniq
write temporary files to disk for long datasets? If so, where?
Thank you!
CodePudding user response:
uniq
only needs to read a line at a time and compare the current line to the previous one; it can start working as soon as it starts getting lines; no need to read all input before producing any output.
Basically, it just needs to read a line and compare it to the previous line. If they're the same, increment a counter. If not, print the previous line (with its count, if requested). Then save the current line as the previous one, and repeat.
Here's a bare bones version written in C you can use as an example:
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Write one run of identical lines to stdout.
 *
 * line/len:    bytes of the line exactly as read (including any newline).
 * count:       number of adjacent occurrences in the run.
 * show_count:  when non-zero, prefix the line with the count, right-aligned
 *              in a 7-character field followed by a space.
 */
static void emit_run(const char *line, size_t len, unsigned long count,
                     _Bool show_count) {
    if (show_count) {
        printf("%7lu ", count);
    }
    /* fwrite rather than fputs so that embedded NUL bytes are preserved. */
    fwrite(line, 1, len, stdout);
}

/*
 * Bare-bones uniq: collapse runs of identical adjacent lines read from stdin.
 *
 * Usage: prog [-c]
 *   -c  prefix each output line with the number of times it occurred.
 *
 * Only the first line of the current run is kept in memory, so output can
 * start as soon as a differing line arrives; the whole input is never
 * buffered.
 *
 * Returns EXIT_FAILURE on bad arguments or an I/O error, 0 otherwise.
 *
 * getline() and ssize_t are POSIX, not ISO C.
 */
int main(int argc, char **argv) {
    _Bool show_count = 0;
    if (argc == 2 && strcmp(argv[1], "-c") == 0) {
        show_count = 1;
    } else if (argc > 1) {
        fprintf(stderr, "Usage: %s [-c]\n", argv[0]);
        return EXIT_FAILURE;
    }

    char *prev = NULL;       /* first line of the current run (owned) */
    size_t prev_len = 0;     /* its length in bytes */
    unsigned long count = 1; /* lines seen so far in the current run */

    while (1) {
        char *line = NULL;
        size_t line_cap = 0;
        errno = 0;
        ssize_t len = getline(&line, &line_cap, stdin);
        if (len < 0) {
            /* Save errno before free()/feof() get a chance to clobber it. */
            int saved_errno = errno;
            /* getline may have allocated a buffer even though it failed. */
            free(line);
            if (feof(stdin)) {
                break;
            }
            fprintf(stderr, "Couldn't read input: %s\n", strerror(saved_errno));
            free(prev);
            return EXIT_FAILURE;
        }

        size_t cur_len = (size_t)len;
        /* Compare by length + memcmp so lines containing NUL bytes are
         * compared in full, matching what fwrite will output. */
        if (prev && cur_len == prev_len && memcmp(line, prev, cur_len) == 0) {
            count++;
            free(line);
            continue;
        }

        /* A new run starts: flush the previous one, if any. */
        if (prev) {
            emit_run(prev, prev_len, count, show_count);
            free(prev);
        }
        prev = line;
        prev_len = cur_len;
        count = 1;
    }

    /* Flush the final run (absent only when the input was empty). */
    if (prev) {
        emit_run(prev, prev_len, count, show_count);
        free(prev);
    }

    if (fflush(stdout) != 0 || ferror(stdout)) {
        fprintf(stderr, "Couldn't write output: %s\n", strerror(errno));
        return EXIT_FAILURE;
    }
    return 0;
}