I have an input file named datafile.data, which looks something like this:
1,2,1,1,0
1,3,1,1,0
1,1,2,2,1
2,1,2,2,1
2,3,2,3,1
1,1,2,3,2
3,1,1,4,2
2,1,3,2,2
3,3,3,1,2
2,2,3,4,2
Here the first 4 columns stand for 4 attribute values, say A1, A2, A3, A4, and the final column stands for the class value. This particular sample file has 4 attributes, but other files can have any number 'n' of attributes; in every file, the last column gives the class values.
Now I want to convert this file to another file named as : outputfile.exp
Where the output file's 1st row looks something like below:
<Number of rows in the .data file> <Number of attributes> <Max value of A1> <Max value of A2> <Max value of A3> <Max value of A4> <(Max value of last column) + 1>
And the remaining rows of the output file will be same as the data file, with just one change, that is the last column's each value will be incremented by 1.
For an example the output file for the above example will look like:
10 4 3 3 3 4 3
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3
Here the 10 in the 1st row is the number of rows, 4 is the number of attributes present, (3,3,3,4) are the maximum values of attributes A1, A2, A3 and A4, and the final 3 is the highest class value + 1. Every value in the last column has been incremented by 1 as well.
Below I am attaching my try:
#include <stdio.h>
#include <string.h>
#define MAX_FILE_NAME 100
/*
 * Count the lines of datafile.data, print that count, then echo the file
 * contents to standard output.
 *
 * Returns 0 in every case (including when the file cannot be opened), as
 * the original program did.
 */
int main()
{
    /* Single source of truth for the input name, used for fopen and messages. */
    const char *input_path = "datafile.data";
    char dataToBeRead[50];
    int count = 0;      /* Line counter (result) */
    int c;              /* int, not char: getc() returns int so EOF stays distinguishable */
    int last = '\n';    /* Last character seen; detects a final line lacking '\n' */
    FILE *fp;

    /* Open the file */
    fp = fopen(input_path, "r");
    if (fp == NULL)
    {
        printf("Could not open file %s", input_path);
        return 0;
    }

    /* First pass: count newline characters. */
    while ((c = getc(fp)) != EOF)
    {
        if (c == '\n')
        {
            count++;
        }
        last = c;
    }
    /* A non-empty final line without a trailing newline is still a row. */
    if (last != '\n')
    {
        count++;
    }
    fclose(fp);
    printf("%d\n", count);

    /* Second pass: print the file contents chunk by chunk. */
    fp = fopen(input_path, "r");
    if (fp == NULL)
    {
        printf("Failed to open.");
    }
    else
    {
        while (fgets(dataToBeRead, sizeof dataToBeRead, fp) != NULL)
        {
            printf("%s", dataToBeRead);
        }
        fclose(fp);
    }
    return 0;
}
And I am getting the below output:
10
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3
Now I am unable to proceed further, as I am very new to C, please help me out.
CodePudding user response:
You really don't want to do this, since rewinding an input stream is an anti-pattern. But you can do something like:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);
/*
 * Parse one comma-separated line of decimal integers and fold each value
 * into the running per-column maxima.
 *
 * buf          - NUL-terminated text of one input line
 * max          - array of column_count ints; max[i] is raised to the value
 *                found in column i when that value is larger
 * column_count - number of columns expected on the line
 *
 * Every column except the last must be followed by ','; the last column
 * must be followed by '\n' or by the end of the string (so a final line
 * without a trailing newline is accepted). On any violation, an empty
 * field, or a value that does not fit in an int, a diagnostic is written
 * to stderr and the process exits with status 1.
 */
void
parse_line(const char *buf, int *max, int column_count)
{
	for( int i = 0; i < column_count; i++ ){
		char *end;
		long t = strtol(buf, &end, 10);
		int is_last = (i == column_count - 1);

		/* end == buf means strtol consumed no digits (empty field). */
		if( end == buf
			|| !((!is_last && *end == ',')
			|| (is_last && (*end == '\n' || *end == '\0'))) ){
			fprintf(stderr, "invalid input '%c' in %s", *end, buf);
			exit(1);
		}
		/* max[] holds ints, so refuse values that would be truncated. */
		if( t < INT_MIN || t > INT_MAX ){
			fprintf(stderr, "value out of range in %s", buf);
			exit(1);
		}
		if( t > max[i] ){
			max[i] = (int)t;
		}
		/* Step past the delimiter to the start of the next field. */
		buf = end + 1;
	}
}
/*
 * Convert a comma-separated data file into the ".exp" format on stdout.
 *
 * Input comes from the file named by argv[1], or from stdin when no
 * argument is given. The input is read twice, so it must be seekable.
 *
 * Output:
 *   header: <rows> <columns - 1> <max of each column but the last>
 *           <max of last column + 1>
 *   body:   every input line, with its last field incremented by 1
 *
 * Returns 0 on success and 1 on an input, seek or format error.
 */
int
main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "stdin";
	FILE *in = argc > 1 ? xfopen(path, "r") : stdin;
	char buf[1024];
	int column_count = 1;
	int row_count = 1;
	int *max;

	/* Read first line to determine number of columns */
	if( fgets(buf, sizeof buf, in) == NULL ){
		fputs("Input error\n", stderr);
		return 1;
	}
	/* N commas on the first line means N + 1 columns. */
	for( const char *p = buf; *p; p++ ){
		if( *p == ',' ){
			column_count += 1;
		}
	}

	max = xmalloc(column_count * sizeof *max);
	for( int i = 0; i < column_count; i++ ){
		max[i] = INT_MIN;
	}

	/* First pass: count rows and collect the per-column maxima. */
	parse_line(buf, max, column_count);
	while( fgets(buf, sizeof buf, in) != NULL ){
		row_count += 1;
		parse_line(buf, max, column_count);
	}

	/* Go back to the start for the second pass; fails on pipes. */
	if( fseek(in, 0L, SEEK_SET) ){
		perror(path);
		free(max);
		return 1;
	}

	/* Header line. The last column is not counted as an attribute. */
	printf("%d %d ", row_count, column_count - 1);
	for( int i = 0; i < column_count - 1; i += 1 ){
		printf("%d ", max[i]);
	}
	printf("%d\n", max[column_count - 1] + 1);

	/* Second pass: re-emit each line with its last field incremented. */
	while( fgets(buf, sizeof buf, in) != NULL ){
		char *comma = strrchr(buf, ',');
		if( comma == NULL ){
			fprintf(stderr, "Invalid input\n");
			free(max);
			return 1;
		}
		/* Cut the line at the last comma so buf holds the leading fields. */
		*comma = '\0';
		int k = strtol(comma + 1, NULL, 10);
		printf("%s,%d\n", buf, k + 1);
	}

	free(max);
	return 0;
}
/*
 * Open a stream or terminate the process.
 *
 * A path of exactly "-" selects a standard stream instead of a file:
 * stdin when mode begins with 'r', stdout otherwise. Any other path is
 * handed to fopen(). If the resulting stream is NULL, the error is
 * reported with perror(path) and the process exits with EXIT_FAILURE.
 */
FILE *
xfopen(const char *path, const char *mode)
{
	FILE *stream;
	int is_dash = (path[0] == '-' && path[1] == '\0');

	if( is_dash ){
		if( mode[0] == 'r' ){
			stream = stdin;
		} else {
			stream = stdout;
		}
	} else {
		stream = fopen(path, mode);
	}

	if( stream != NULL ){
		return stream;
	}
	perror(path);
	exit(EXIT_FAILURE);
}
/*
 * Allocate s bytes with malloc() or terminate the process.
 *
 * Returns the pointer obtained from malloc(). When malloc() returns NULL,
 * the failure is reported with perror("malloc") and the process exits
 * with EXIT_FAILURE, so callers never see a NULL result.
 */
void *
xmalloc(size_t s)
{
	void *block = malloc(s);

	if( block != NULL ){
		return block;
	}
	perror("malloc");
	exit(EXIT_FAILURE);
}
You can execute this as ./a.out < datafile.data > outputfile.exp or ./a.out datafile.data > outputfile.exp, but this will not work if you try to read from a pipe (the seek will fail). The seek failure and the inability to run this as a filter make this a suboptimal approach, but storing the entire file in memory also has drawbacks.
CodePudding user response:
William Pursell has already provided a superb answer in C; here is an awk alternative, although awk is not tagged.
awk -F, -v OFS="," '                # use a comma as input and output field separator
NR==FNR {                           # 1st read of the input file: collect statistics
    for (i = 1; i <= NF; i++) {     # loop over the fields
        if (max[i] == "" || max[i] < $i) max[i] = $i
                                    # update the max values
    }
    nr = NR; nf = NF                # store #records and #fields
    next                            # skip following statements
}
FNR==1 {                            # runs once, on the 1st line of the 2nd read: print the header
    printf("%d %d ", nr, nf - 1)    # print #records and #fields - 1
    max[nf]++                       # increment the max value of the last field
    for (i = 1; i <= nf; i++) {     # print max values
        printf("%d%s", max[i], i==nf ? "\n" : " ");
    }
}
{                                   # 2nd read: runs for every line
    $nf++                           # increment the value of the last field
    print                           # print fields as csv
}
' datafile.data datafile.data       # read the input file twice