Theoretica
Mathematical Library
Loading...
Searching...
No Matches
csv.h
Go to the documentation of this file.
1
5
6#ifndef THEORETICA_IO_CSV_H
7#define THEORETICA_IO_CSV_H
8
9#include <fstream>
10#include <iomanip>
11#include <algorithm>
12
13#include "./error.h"
14#include "../algebra/vec.h"
15#include "../algebra/mat.h"
16#include "../statistics/histogram.h"
17#include "./data_table.h"
18#include "./strings.h"
19
20
21namespace theoretica {
22namespace io {
23
24
/// Split a single line of CSV text into its fields.
///
/// Double quotes toggle "quoted" mode and are never copied to the output.
/// Outside quotes, the delimiter ends the current field and whitespace is
/// dropped; inside quotes every character (including the delimiter and
/// whitespace) is kept verbatim. Escaped quotes ("") are not supported.
///
/// @param line The line to split (without the trailing newline)
/// @param delimiter The character separating fields (defaults to ',')
/// @return The list of fields; always contains at least one (possibly empty) entry
inline std::vector<std::string> parse_csv(const std::string& line, char delimiter = ',') {

    std::vector<std::string> fields;
    std::string field;
    bool quoted = false;

    for (const char c : line) {

        if (c == '"') {
            quoted = !quoted;
        } else if (c == delimiter && !quoted) {
            fields.emplace_back(field);
            field.clear();
        } else if (quoted || !std::isspace(static_cast<unsigned char>(c))) {
            // std::isspace requires a value representable as unsigned char:
            // passing a negative char (e.g. a UTF-8 byte) is undefined behaviour.
            field += c;
        }
    }

    // The last field has no trailing delimiter
    fields.emplace_back(field);
    return fields;
}
55
56
/// Prepare a string entry for printing to a CSV file.
///
/// The entry is wrapped in double quotes when it contains a comma or any
/// whitespace character; otherwise it is returned unchanged. Double quotes
/// already present in the entry are not escaped.
///
/// @param str The entry to sanitize
/// @return The entry, quoted if needed
inline std::string quote_csv(const std::string& str) {

    bool needs_quotes = str.find(',') != std::string::npos;

    if (!needs_quotes) {
        for (const char c : str) {

            // Cast first: std::isspace is undefined for negative char values
            if (std::isspace(static_cast<unsigned char>(c))) {
                needs_quotes = true;
                break;
            }
        }
    }

    if (needs_quotes)
        return "\"" + str + "\"";

    return str;
}
78
79
85 template<typename Type, unsigned int N>
86 inline void write_csv(
87 const std::string& filename, const vec<Type, N>& v, unsigned int precision = 8) {
88
89 std::ofstream file (filename);
90 if (!file.is_open()) {
91 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
92 return;
93 }
94
95 for (size_t i = 0; i < v.size(); ++i)
96 file << std::setprecision(precision) << v[i] << std::endl;
97 }
98
99
105 template<typename Type, unsigned int N>
106 inline void write_csv(
107 const std::string& filename, const std::string& header,
108 const vec<Type, N>& v, unsigned int precision = 8) {
109
110 std::ofstream file (filename);
111 if (!file.is_open()) {
112 // TODO: throw another exception ?
113 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
114 return;
115 }
116
117 file << "\"" << header << "\"" << std::endl;
118 for (size_t i = 0; i < v.size(); ++i)
119 file << std::setprecision(precision) << v[i] << std::endl;
120 }
121
122
130 template<typename Type, unsigned int N, enable_real<Type> = true>
131 inline void read_csv(const std::string& filename, vec<Type, N>& v) {
132
133 std::ifstream file (filename);
134 std::string line;
135
136 if (!file.is_open()) {
137 TH_IO_ERROR("io::read_csv", filename, IoError::ReadError);
138 return;
139 }
140
141 // Check for header
142 if (!std::getline(file, line))
143 return;
144
146
147 // Resulting column vector
148 std::vector<real> col;
149
150 if (io::is_number(line)) {
151 try {
152 std::replace(line.begin(), line.end(), ',', '.');
153 real first = std::stod(line);
154 col.emplace_back(first);
155 } catch (const std::invalid_argument& e) {
156 // Do nothing, the entry is not a number
157 } catch(const std::out_of_range& e) {
158 TH_MATH_ERROR("io::read_csv", filename, MathError::OutOfRange);
159 }
160 }
161
162 // All remaining lines are data
163 while (std::getline(file, line)) {
164
166 std::replace(line.begin(), line.end(), ',', '.');
167
168 try {
169 real val = std::stod(line);
170 col.emplace_back(val);
171 } catch (const std::exception& e) {
172 col.emplace_back(nan());
173 }
174 }
175
176 // Handle mismatched sizes with empty values (NaN)
177 if (v.size() > col.size()) {
178
179 for (size_t i = 0; i < col.size(); i++)
180 v[i] = col[i];
181
182 for (size_t i = col.size(); i < v.size(); i++)
183 v[i] = nan();
184
185 } else {
186 algebra::vec_copy(v, col);
187 }
188 }
189
190
201 template<typename Type, unsigned int N, enable_real<Type> = true>
202 inline void read_csv(
203 const std::string& filename, const std::string& col_name,
204 vec<Type, N>& v, bool trim_nan = true) {
205
206 std::ifstream file (filename);
207 std::string line;
208
209 if (!file.is_open()) {
210 TH_IO_ERROR("io::read_csv", filename, IoError::ReadError);
212 return;
213 }
214
215 // Read header
216 if (!std::getline(file, line))
217 return;
218
219 // Find the index of the specified column
220 std::vector<std::string> headers = parse_csv(line);
221 int col_index = -1;
222 for (size_t i = 0; i < headers.size(); ++i) {
223
224 if (headers[i] == col_name) {
225 col_index = i;
226 break;
227 }
228 }
229
230 // No column was found
231 if (col_index == -1) {
232
233 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
234
235 if (!v.size())
236 v.resize(1);
237
238 for (size_t i = 0; i < v.size(); ++i)
239 v[i] = nan();
240
241 return;
242 }
243
244 // Read data from the specified column
245 std::vector<real> data;
246 std::vector<std::string> cells;
247
248 while (std::getline(file, line)) {
249
251
252 if (size_t(col_index) < cells.size()) {
253
254 std::string cell = cells[col_index];
255 std::replace(cell.begin(), cell.end(), ',', '.');
256
257 try {
258 const real val = std::stod(cell);
259 data.emplace_back(val);
260 } catch (const std::exception& e) {
261 data.emplace_back(nan());
262 }
263 } else {
264 data.emplace_back(nan());
265 }
266 }
267
268 // Trim trailing NaN values if enabled
269 size_t actual_size = data.size();
270
271 if (trim_nan)
272 while (actual_size > 0 && is_nan(data[actual_size - 1]))
273 actual_size--;
274
275 // Need to allocate space
276 if (v.size() < actual_size) {
277
279
280 if (v.size() < actual_size) {
281 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
283 return;
284 }
285 }
286
287 for (size_t i = 0; i < actual_size; i++)
288 v[i] = data[i];
289
290 for (size_t i = actual_size; i < v.size(); i++)
291 v[i] = nan();
292 }
293
294
299 template<typename Type, unsigned int N, unsigned int M>
300 inline void write_csv(
301 const std::string& filename, const mat<Type, N, M>& A,
302 const std::string& delimiter = ", ", unsigned int precision = 8) {
303
304 std::ofstream file (filename);
305
306 if (!file.is_open()) {
307 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
308 return;
309 }
310
311 for (size_t i = 0; i < A.rows(); i++) {
312 for (size_t j = 0; j < A.cols(); j++) {
313
314 file << std::setprecision(precision) << A(i, j);
315
316 if (j != A.cols() - 1)
317 file << delimiter;
318 else
319 file << std::endl;
320 }
321 }
322 }
323
324
331 template<unsigned int N, unsigned int K>
332 inline void read_csv(const std::string& filename, mat<real, N, K>& A) {
333
334 std::ifstream file (filename);
335 std::string line;
336
337 if (!file.is_open()) {
338 TH_IO_ERROR("io::read_csv", filename, IoError::FileNotFound);
339 return;
340 }
341
342 std::vector<std::vector<real>> rows;
343
344 // Read first line to check for header
345 if (!std::getline(file, line))
346 return;
347
348 std::vector<std::string> first_row = parse_csv(line);
349
350 // Check if first line is a header
351 bool has_header = false;
352
353 for (const auto& cell : first_row) {
354
355 if (!io::is_number(cell)) {
356 has_header = true;
357 break;
358 }
359 }
360
361 // If first line is not a header, process it as data
362 if (!has_header) {
363
364 std::vector<real> row;
365
366 for (auto cell : first_row) {
367
368 std::replace(cell.begin(), cell.end(), ',', '.');
369
370 try {
371 row.emplace_back(std::stod(cell));
372 } catch (const std::exception& e) {
373 row.emplace_back(nan());
374 }
375 }
376
377 if (!row.empty())
378 rows.emplace_back(row);
379 }
380
381 // Read remaining lines
382 while (std::getline(file, line)) {
383
384 // Skip empty lines
385 if (line.empty())
386 continue;
387
388 std::vector<std::string> cells = parse_csv(line);
389 std::vector<real> row;
390
391 for (auto cell : cells) {
392
393 std::replace(cell.begin(), cell.end(), ',', '.');
394
395 try {
396 row.emplace_back(std::stod(cell));
397 } catch (const std::exception& e) {
398 row.emplace_back(nan());
399 }
400 }
401
402 if (!row.empty()) {
403 rows.emplace_back(row);
404 }
405 }
406
407 A.resize(rows.size(), rows[0].size());
408
409 if (A.rows() < rows[0].size() || A.cols() < rows.size()) {
410 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
412 return;
413 }
414
415 // Fill matrix with parsed data
416 for (size_t i = 0; i < min(rows.size(), A.rows()); ++i) {
417
418 for (size_t j = 0; j < min(rows[i].size(), A.cols()); ++j)
419 A(i, j) = rows[i][j];
420
421 // Pad remaining columns with NaN
422 for (size_t j = rows[i].size(); j < A.cols(); ++j)
423 A(i, j) = nan();
424 }
425
426 // Pad remaining rows with NaN
427 for (size_t i = rows.size(); i < A.rows(); ++i)
428 for (size_t j = 0; j < A.cols(); ++j)
429 A(i, j) = nan();
430 }
431
432
437 inline void write_csv(
438 const std::string& filename, const data_table& table,
439 const std::string& delimiter = ", ", unsigned int precision = 8) {
440
441 std::ofstream file (filename);
442
443 if (!file.is_open()) {
444 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
445 return;
446 }
447
448 // Write header
449 bool first = true;
450 for (const std::string& name : table.header()) {
451
452 if (!first)
453 file << delimiter;
454
455 file << quote_csv(name);
456 first = false;
457 }
458 file << std::endl;
459
460 // Write data rows
461 size_t max_rows = table.rows();
462 for (size_t i = 0; i < max_rows; ++i) {
463
464 first = true;
465 for (const auto& col : table.data()) {
466
467 if (!first)
468 file << delimiter;
469
470 if (i < col.size())
471 file << std::setprecision(precision) << col[i];
472 else
473 file << nan();
474
475 first = false;
476 }
477 file << std::endl;
478 }
479
480 }
481
482
489 inline void read_csv(const std::string& filename, data_table& table) {
490
491 std::ifstream file (filename);
492 std::string line;
493
494 if (!file.is_open()) {
495 TH_IO_ERROR("io::read_csv", filename, IoError::FileNotFound);
496 return;
497 }
498
499 // Read header
500 if (!std::getline(file, line))
501 return;
502
503 std::vector<std::string> first_row = parse_csv(line);
504
505 if (first_row.empty())
506 return;
507
508 std::vector<std::string> column_names;
509 size_t num_cols = first_row.size();
510 std::vector<vec<real>> columns (num_cols);
511
512 // Check if first line is a header
513 bool has_header = false;
514
515 for (const auto& cell : first_row) {
516
517 if (!io::is_number(cell)) {
518 has_header = true;
519 break;
520 }
521 }
522
523 // If first line is not a header, process it as data
524 if (!has_header) {
525
526 for (size_t j = 0; j < first_row.size(); ++j) {
527
528 std::string cell = first_row[j];
529 std::replace(cell.begin(), cell.end(), ',', '.');
530
531 try {
532 columns[j].append(std::stod(cell));
533 } catch (const std::exception& e) {
534 columns[j].append(nan());
535 }
536 }
537
538 // Generate default column names
539 for (size_t j = 0; j < num_cols; ++j) {
540 column_names.emplace_back("col" + std::to_string(j));
541 }
542
543 } else {
544
545 for (const auto& name : first_row)
546 column_names.emplace_back(io::unquote(io::trim(name)));
547 }
548 first_row.clear();
549
550 // Read data rows
551 while (std::getline(file, line)) {
552
553 if (line.empty())
554 continue;
555
556 std::vector<std::string> cells = parse_csv(line);
557
558 for (size_t j = 0; j < num_cols; ++j) {
559
560 if (j < cells.size()) {
561 std::string cell = cells[j];
562 std::replace(cell.begin(), cell.end(), ',', '.');
563
564 try {
565 real val = std::stod(cell);
566 columns[j].append(val);
567 } catch (const std::exception& e) {
568 columns[j].append(nan());
569 }
570 } else {
571 columns[j].append(nan());
572 }
573 }
574 }
575
576 for (size_t j = 0; j < num_cols; ++j)
577 table.insert(column_names[j], columns[j]);
578 }
579
580
593 inline void write_csv(
594 const std::string& filename, const histogram& hist,
595 bool normalized = false, bool lower_extreme = false,
596 const std::string& delimiter = ", ", unsigned int precision = 8) {
597
598 std::ofstream file (filename);
599 if (!file.is_open()) {
600 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
601 return;
602 }
603
604 const auto bin_counts = hist.bins();
605
606 // Can't write histogram without bins
607 if (!bin_counts.size()) {
608 TH_IO_ERROR("io::write_csv", filename, IoError::FormatError);
609 return;
610 }
611
612 const real bin_dx = (hist.range()[1] - hist.range()[0]) / bin_counts.size();
613 real norm_factor = normalized ? hist.number() * bin_dx : 1.0;
615 norm_factor = 1.0;
616
617 // Keep track of the coordinate of the current bin, starting from the lowest bin edge or center.
618 real bin_value = lower_extreme ? hist.range()[0] : (hist.range()[0] + 0.5 * bin_dx);
619
620 // Write header with histogram statistics
621 file << "bins, counts, number, average, tss, min, max" << std::endl;
622 if (!bin_counts.size())
623 return;
624
625 file << std::setprecision(precision) << bin_value << delimiter;
626 file << std::setprecision(precision) << (bin_counts[0] / norm_factor) << delimiter;
627 file << hist.number() << delimiter;
628 file << hist.mean() << delimiter;
629 file << hist.tss() << delimiter;
630 file << hist.min() << delimiter;
631 file << hist.max() << std::endl;
632
633 for (size_t i = 1; i < bin_counts.size(); i++) {
634
635 bin_value += bin_dx;
636 file << std::setprecision(precision) << bin_value << delimiter;
637 file << std::setprecision(precision) << (bin_counts[i] / norm_factor) << std::endl;
638 }
639 }
640
641
648 inline void read_csv(const std::string& filename, histogram& hist, bool lower_extreme = false) {
649
650 std::ifstream file (filename);
651 if (!file.is_open()) {
652 TH_IO_ERROR("io::read_csv", filename, IoError::FileNotFound);
653 return;
654 }
655
656 std::string line;
657 std::vector<std::string> cells;
658
659 // Read header
660 std::getline(file, line);
662
663 int bin_index = -1, count_index = -1, number_index = -1, average_index = -1;
664 int tss_index = -1, min_index = -1, max_index = -1;
665
666 // Find column indices for the expected headers
667 for (size_t i = 0; i < cells.size(); i++) {
668
669 if (cells[i] == "bins") bin_index = i;
670 else if (cells[i] == "counts") count_index = i;
671 else if (cells[i] == "number") number_index = i;
672 else if (cells[i] == "average") average_index = i;
673 else if (cells[i] == "tss") tss_index = i;
674 else if (cells[i] == "min") min_index = i;
675 else if (cells[i] == "max") max_index = i;
676 }
677
678 if (bin_index == -1 || count_index == -1 || number_index == -1 ||
679 average_index == -1 || tss_index == -1 || min_index == -1 || max_index == -1) {
680
681 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
682 return;
683 }
684
685 // Find the maximum required column index
686 int min_size = bin_index;
693 min_size++;
694
695 // Read first data line for statistics
696 std::getline(file, line);
698
699 if (cells.size() < size_t(min_size)) {
700 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
701 return;
702 }
703
704 vec<real> counts, bins;
705 size_t N;
706 real run_average, run_tss, value_min, value_max;
707
708 try {
709
710 counts = {std::stod(cells[count_index])};
711 bins = {std::stod(cells[bin_index])};
712
713 N = std::stod(cells[number_index]);
714 run_average = std::stod(cells[average_index]);
715 run_tss = std::stod(cells[tss_index]);
716 value_min = std::stod(cells[min_index]);
717 value_max = std::stod(cells[max_index]);
718
719 } catch (const std::invalid_argument& e) {
720 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
721 return;
722 }
723
725
726 // Read remaining data lines
727 while (std::getline(file, line)) {
728
730
731 if (cells.size() < size_t(bins_max_index)) {
732 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
733 return;
734 }
735
736 try {
737 counts.append(cells[count_index] != "" ? std::stod(cells[count_index]) : nan());
738 bins.append(cells[bin_index] != "" ? std::stod(cells[bin_index]) : nan());
739 } catch (const std::exception& e) {
740 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
741 return;
742 }
743
744 }
745
746 real range_min;
747 real range_max;
748
749 bool is_normalized = false;
750 for (size_t i = 0; i < counts.size(); i++) {
751
752 // Check if any bin counts are not integers
753 if (counts[i] != floor(counts[i])) {
754 is_normalized = true;
755 break;
756 }
757 }
758
759 std::vector<unsigned int> bin_counts (counts.size());
760 for (size_t i = 0; i < counts.size(); i++) {
761 bin_counts[i] = (unsigned int) (is_normalized ? (counts[i] * N) : counts[i]);
762 }
763
764 const real bin_dx = bins.size() > 1 ? (bins[1] - bins[0]) : 0;
765 range_min = lower_extreme ? bins[0] : (bins[0] - 0.5 * bin_dx);
766 range_max = lower_extreme ? bins[bins.size() - 1] + bin_dx : bins[bins.size() - 1] + 0.5 * bin_dx;
767
768 // Check constant bin spacing
769 for (size_t i = 1; i < bins.size(); i++) {
770 if (abs((bins[i] - bins[i - 1]) - bin_dx) > 1e-6) {
771 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
772 return;
773 }
774 }
775
776 hist.rebuild(
777 bin_counts, vec2({range_min, range_max}),
778 N, run_average, run_tss,
779 value_min, value_max
780 );
781
782 }
783
/// Read an object of the given type from a file in the CSV format.
///
/// A default-constructed object is filled by the read_csv overload
/// taking (filename, Type&) and then returned by value.
///
/// @param filename The path of the input file
/// @return The object read from the file
template<typename Type>
inline Type read_csv(const std::string& filename) {

    Type result;
    read_csv(filename, result);

    return result;
}
799
800}}
801
802#endif
A data structure for holding labeled columns of data, where each column is a vector of real numbers.
Definition data_table.h:26
Histogram class with running statistics, can be constructed from the parameters of the bins or from a...
Definition histogram.h:28
A generic matrix with a fixed number of rows and columns.
Definition mat.h:136
TH_CONSTEXPR unsigned int rows() const
Returns the number of rows in the matrix.
Definition mat.h:639
TH_CONSTEXPR unsigned int cols() const
Returns the number of columns in the matrix.
Definition mat.h:646
mat< Type, N, K > resize(unsigned int n, unsigned int k)
Compatibility function to allow for allocation or resizing of dynamic matrices.
Definition mat.h:728
A statically allocated N-dimensional vector with elements of the given type.
Definition vec.h:92
void resize(size_t n) const
Compatibility function to allow for allocation or resizing of dynamic vectors.
Definition vec.h:459
TH_CONSTEXPR unsigned int size() const
Returns the size of the vector (N)
Definition vec.h:449
#define TH_MATH_ERROR(F_NAME, VALUE, EXCEPTION)
TH_MATH_ERROR is a macro which throws exceptions or modifies errno (depending on which compilation op...
Definition error.h:225
Data table structure for holding labeled columns of data.
Error handling for IO operations.
Vector1 & vec_copy(Vector1 &dest, const Vector2 &src)
Copy a vector by overwriting another.
Definition algebra.h:195
Matrix & mat_error(Matrix &m)
Overwrite the given matrix with the error matrix with NaN values on the diagonal and zeroes everywher...
Definition algebra.h:40
Vector & vec_error(Vector &v)
Overwrite the given vector with the error vector with NaN values.
Definition algebra.h:58
bool is_number(const std::string &str)
Check if a given string could be correctly interpreted as a number.
Definition strings.h:16
std::string trim(const std::string &str)
Remove all leading and trailing whitespace from a string, returning the resulting string.
Definition strings.h:36
@ FileNotFound
File or directory not found.
@ FormatError
The file format is invalid or the data is corrupted.
@ ReadError
Error occurred while reading from the file or stream.
std::string quote_csv(const std::string &str)
Given a string entry, sanitize it for printing to a CSV file.
Definition csv.h:62
std::vector< std::string > parse_csv(const std::string &line, char delimiter=',')
Parse a CSV line handling quoted fields.
Definition csv.h:32
void read_csv(const std::string &filename, vec< Type, N > &v)
Read a vector from a file in the CSV format.
Definition csv.h:131
void write_csv(const std::string &filename, const vec< Type, N > &v, unsigned int precision=8)
Write a vector to file in the CSV format.
Definition csv.h:86
std::string unquote(const std::string &str)
Remove leading and trailing double quotes from a string, if both are present.
Definition strings.h:54
Main namespace of the library which contains all functions and objects.
Definition algebra.h:27
double real
A real number, defined as a floating point type.
Definition constants.h:207
auto min(const Vector &X)
Finds the minimum value inside a dataset.
Definition dataset.h:351
vec< real, 2 > vec2
A 2-dimensional vector with real elements.
Definition algebra_types.h:39
bool is_nan(const T &x)
Check whether a generic variable is (equivalent to) a NaN number.
Definition error.h:94
dual2 abs(dual2 x)
Compute the absolute value of a second order dual number.
Definition dual2_functions.h:242
TH_CONSTEXPR Type make_error()
Create a number representing an error state, constructed from a NaN value.
Definition real_analysis.h:1322
TH_CONSTEXPR real nan()
Return a quiet NaN number in floating point representation.
Definition error.h:78
@ OutOfRange
Result out of range.
constexpr real MACH_EPSILON
Machine epsilon for the real type.
Definition constants.h:216
TH_CONSTEXPR int floor(real x)
Compute the floor of x, as the maximum integer number that is smaller than x.
Definition real_analysis.h:271