Read CSV File
Read values from a csv file regardless of format
One of the challenges regardless of programming language is reading needed information from data files. What frequently happens is that a new routine is written each time the problem is encountered, customized for the format of the file holding the data. In almost all circumstances, the correct approach is line-oriented input: reading an entire line at a time using either getline
or fgets
[1]. However, there is one circumstance where reading a file one character-at-a-time provides an advantage (and reusable code). When reading numeric values, the rookie choice is to try and craft a format string for fscanf
to match the data in the file -- wrong approach.
Don't get me wrong, in certain circumstances fscanf
provides a good solution, but more times than not, any slight variation in format of the data file, or stray character, will bring this approach crashing down. (not to mention the stray newline
characters that are invariably left in the input buffer by the inexperienced.) Line oriented input again is a good solution, reading a line of data into a buffer, then using sscanf
(or simple pointers) to parse the needed information. But again, the parsing of the buffer must be customized with each change in the data file.
In this situation, character input can allow a routine to be written to parse a file for numeric values that will work regardless of the input file format. Even if the file creator is a nut and decides to separate each numeric value with ‘--’
or ‘,;*a_’
or decides to put 10 newline
characters between the values, a character input strategy can still work. The following is an example of this approach. It will read a file of numeric values into a dynamically allocated array of long integer
s. Space for 64 long integers
is initially allocated, and the array is reallocated as needed. While this is just an example using getchar()
to read values from stdin
within main()
, it is trivial to convert it into a function that accepts a filename and a pointer to an index: simply pass the filename to the function along with a pointer to an index, and have the function return the filled array and the updated index containing the number of values in the array. Using fgetc
, you can easily have the function read from either a file or stdin
. Here is the example, give it any csv file of numeric values and test it:
Example Program:
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXL 32     /* size of the text buffer holding one number */
#define NMAX 64     /* initial number of elements in the array    */

long xstrtol (char *p, char **ep, int base);
long *realloc_long (long *lp, unsigned long *n);
void prn_stats (long *a, size_t n);

static int next_number (FILE *fp, char *buf, size_t size);

/* read every integer found on stdin, whatever separates the values,
 * into a dynamically grown array of long and print summary statistics.
 * returns 0 on success, 1 on allocation failure or over-long number.
 */
int main (void)
{
    long *array = NULL;             /* ptr to array of long           */
    char buffer[MAXL] = {0};        /* buffer for number string       */
    char *ep = buffer;              /* end ptr for strtol             */
    size_t idx = 0;                 /* number of values stored        */
    unsigned long nmax = NMAX;      /* current capacity of array      */
    int rc = 0;                     /* result of next_number()        */

    /* allocate array of NMAX long using calloc to initialize to 0 */
    if (!(array = calloc (NMAX, sizeof *array))) {
        fprintf (stderr, "error: memory allocation failed.\n");
        return 1;
    }

    /* convert each numeric token; grow the array before it overflows */
    while ((rc = next_number (stdin, buffer, sizeof buffer)) > 0) {
        if (idx == nmax) {
            array = realloc_long (array, &nmax);
        }
        /* xstrtol exits on any conversion error, so the value is good */
        array[idx++] = xstrtol (buffer, &ep, 10);
    }

    if (rc < 0) {
        fprintf (stderr, "error: number longer than %d characters.\n",
                 MAXL - 1);
        free (array);
        return 1;
    }

    prn_stats (array, idx);         /* output min/max/avg..           */
    free (array);                   /* free allocated mem             */

    return 0;
}

/* extract the next integer token ("-" optional, then digits) from 'fp'
 * into 'buf' as a nul-terminated string, skipping everything else.
 * a '-' only counts as a sign when the very next character is a digit.
 * the character that follows a lone '-' or that ends a number is pushed
 * back so it is examined again (it may itself be a sign: "--5", "5-3").
 * redundant leading zeros are dropped so they cannot fill the buffer.
 * 'size' is the size of 'buf' in bytes and must be at least 3.
 * returns 1 when a token was stored, 0 at end of input, -1 when the
 * number does not fit in 'buf' (or the arguments are unusable).
 */
static int next_number (FILE *fp, char *buf, size_t size)
{
    int c;

    if (!fp || !buf || size < 3) {
        return -1;
    }

    while ((c = fgetc (fp)) != EOF) {
        size_t len = 0;
        size_t first;               /* index of the first digit       */

        if (c == '-') {             /* possible sign, look ahead      */
            buf[len++] = '-';
            if ((c = fgetc (fp)) == EOF) {
                return 0;
            }
        }
        if (c < '0' || c > '9') {   /* no number starts here          */
            if (len) {
                ungetc (c, fp);     /* re-examine char after the '-'  */
            }
            continue;
        }

        first = len;
        do {
            if (len == first + 1 && buf[first] == '0') {
                len = first;        /* drop redundant leading zero    */
            }
            if (len + 1 >= size) {  /* no room for digit + nul byte   */
                return -1;
            }
            buf[len++] = (char) c;
            c = fgetc (fp);
        } while (c >= '0' && c <= '9');

        buf[len] = '\0';
        if (c != EOF) {
            ungetc (c, fp);         /* terminator may be a sign       */
        }
        return 1;
    }

    return 0;
}

/* reallocate array of long values, increase to 2 * '*n' elements.
 * the existing '*n' values are preserved and the newly added
 * elements are zeroed. returns pointer to the reallocated block on
 * success, otherwise the program exits. value at 'n' is updated to
 * reflect the new allocation size.
 */
long *realloc_long (long *lp, unsigned long *n)
{
    long *tmp = NULL;

    /* refuse sizes whose byte count would overflow size_t */
    if (*n <= SIZE_MAX / 2 / sizeof *lp) {
        tmp = realloc (lp, 2 * *n * sizeof *lp);
    }
#ifdef DEBUG
    printf (" reallocating %lu to %lu\n", *n, *n * 2);
#endif
    if (!tmp) {
        fprintf (stderr, "%s() error: reallocation failed.\n", __func__);
        exit (EXIT_FAILURE);
    }
    lp = tmp;
    memset (lp + *n, 0, *n * sizeof *lp);   /* zero the new elements */
    *n *= 2;

    return lp;
}

/* a simple strtol wrapper with error checking.
 * converts the string at 'p' in the given 'base'; '*ep' receives the
 * address of the first unconverted character. any failed conversion
 * (out of range, or no digits found) will cause program exit. Adjust
 * response to failed conversion as required.
 */
long xstrtol (char *p, char **ep, int base)
{
    errno = 0;

    long tmp = strtol (p, ep, base);

    /* Check for various possible errors */
    if ((errno == ERANGE && (tmp == LONG_MIN || tmp == LONG_MAX)) ||
        (errno != 0 && tmp == 0)) {
        perror ("strtol");
        exit (EXIT_FAILURE);
    }

    if (*ep == p) {
        fprintf (stderr, "No digits were found\n");
        exit (EXIT_FAILURE);
    }

    return tmp;
}

/* simple output routine for min, max, avg, total and array values.
 * 'a' holds 'n' values; at most 50 evenly spaced elements are listed.
 * for n == 0 only the count is printed (there is no min/max/avg).
 * note: the array index output is formatted to 2 places below,
 * adjust as necessary to meet your needs. the total is accumulated
 * in a long and is not protected against overflow.
 */
void prn_stats (long *a, size_t n)
{
    long min = LONG_MAX;
    long max = LONG_MIN;
    long avg = 0;
    long total = 0;
    size_t i = 0;
    size_t sf = 1;

    printf ("\nvalues : %zu\n", n);
    if (n == 0) {
        printf ("\n");
        return;
    }

    /* find min, max, compute total & average & output */
    for (i = 0; i < n; i++) {
        min = a[i] < min ? a[i] : min;
        max = a[i] > max ? a[i] : max;
        total += a[i];
    }
    avg = total / (long) n;     /* signed division, total may be < 0 */

    printf ("minimum : %ld\n", min);
    printf ("maximum : %ld\n", max);
    printf ("total : %ld\n", total);
    printf ("average : %ld\n\n", avg);

    /* scale factor (rounded up) to limit the listing to 50 lines */
    sf = n / 50 + (n % 50 != 0);
    for (i = 0; i < n; i += sf) {
        printf (" a[%2zu] : %ld\n", i, a[i]);
    }
    printf ("\n");
}
Compile with:
gcc -Wall -Wextra -o readlong_csv readlong_csv.c
Sample Input Files:
Each of the following files contains 10 integer values. (note: only numbers immediately preceded by a ‘-’
are considered negative values; any intervening non-digit characters between the ‘-’
and the first digit reset the value search, preventing invalid sign conversion)
A worst case scenario mess:
8572,;a -2213,;--a 6434,; a- 16330,;a - The Quick Brown%3034 Fox 12346Jumps Over A 4855,;*;Lazy 16985/,;a Dog. 11250 1495
Simple comma
or space
separated values:
8572,-2213,6434,16330,3034,12346,4855,16985,11250,1495
8572 -2213 6434 16330 3034 12346 4855 16985 11250 1495
Simple newline
separated values:
8572 -2213 6434 16330 3034 12346 4855 16985 11250 1495
Use / Output:
$ ./bin/read_csv_long <../dat/10intmess.txt values : 10 minimum : -2213 maximum : 16985 total : 79088 average : 7908 a[ 0] : 8572 a[ 1] : -2213 a[ 2] : 6434 a[ 3] : 16330 a[ 4] : 3034 a[ 5] : 12346 a[ 6] : 4855 a[ 7] : 16985 a[ 8] : 11250 a[ 9] : 1495