Theoretica
Scientific Computing
Loading...
Searching...
No Matches
csv.h
Go to the documentation of this file.
1
5
6#ifndef THEORETICA_IO_CSV_H
7#define THEORETICA_IO_CSV_H
8
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iomanip>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "./error.h"
#include "../algebra/vec.h"
#include "../algebra/mat.h"
#include "../statistics/histogram.h"
#include "./data_table.h"
#include "./strings.h"
19
20
21namespace theoretica {
22namespace io {
23
24
/// Parse a single CSV line into its fields, handling quoted fields.
///
/// A double quote toggles "quoted" mode and is not copied to the output,
/// except for a doubled quote ("") inside a quoted field, which yields one
/// literal quote character. Outside quotes the delimiter ends the current
/// field and whitespace is discarded; inside quotes every character
/// (including the delimiter and whitespace) is kept verbatim.
///
/// @param line The line of text to split
/// @param delimiter The field separator (defaults to a comma)
/// @return The fields in order; always at least one (possibly empty) field
inline std::vector<std::string> parse_csv(const std::string& line, char delimiter = ',') {

    std::vector<std::string> fields;
    std::string field;
    bool quoted = false;

    for (size_t i = 0; i < line.length(); ++i) {

        const char c = line[i];

        if (c == '"') {

            // A doubled quote inside a quoted field is an escaped literal quote
            if (quoted && i + 1 < line.length() && line[i + 1] == '"') {
                field += '"';
                ++i;
            } else {
                quoted = !quoted;
            }

        } else if (c == delimiter && !quoted) {

            fields.emplace_back(std::move(field));
            field.clear();

        } else if (quoted || !std::isspace(static_cast<unsigned char>(c))) {

            // The cast is required: std::isspace is undefined for negative
            // char values, which occur for non-ASCII bytes when char is signed.
            field += c;
        }
    }

    fields.emplace_back(std::move(field));
    return fields;
}
55
56
/// Given a string entry, sanitize it for printing to a CSV file.
///
/// The entry is wrapped in double quotes when it contains a comma, a double
/// quote or any whitespace character. Embedded double quotes are doubled ("")
/// so that they do not terminate the quoted field. Any other string,
/// including the empty string, is returned unchanged.
///
/// @param str The entry to sanitize
/// @return The entry, quoted and escaped when needed
inline std::string quote_csv(const std::string& str) {

    bool needs_quotes = false;

    for (const char c : str) {

        // The cast avoids undefined behavior of std::isspace on negative chars
        if (c == ',' || c == '"' || std::isspace(static_cast<unsigned char>(c))) {
            needs_quotes = true;
            break;
        }
    }

    if (!needs_quotes)
        return str;

    std::string quoted;
    quoted.reserve(str.size() + 2);
    quoted += '"';

    for (const char c : str) {

        // Escape an embedded quote by doubling it
        if (c == '"')
            quoted += '"';

        quoted += c;
    }

    quoted += '"';
    return quoted;
}
78
79
85 template<typename Type, unsigned int N>
86 inline void write_csv(
87 const std::string& filename, const vec<Type, N>& v, unsigned int precision = 8) {
88
89 std::ofstream file (filename);
90 if (!file.is_open()) {
91 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
92 return;
93 }
94
95 for (size_t i = 0; i < v.size(); ++i)
96 file << std::setprecision(precision) << v[i] << std::endl;
97 }
98
99
105 template<typename Type, unsigned int N>
106 inline void write_csv(
107 const std::string& filename, const std::string& header,
108 const vec<Type, N>& v, unsigned int precision = 8) {
109
110 std::ofstream file (filename);
111 if (!file.is_open()) {
112 // TODO: throw another exception ?
113 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
114 return;
115 }
116
117 file << "\"" << header << "\"" << std::endl;
118 for (size_t i = 0; i < v.size(); ++i)
119 file << std::setprecision(precision) << v[i] << std::endl;
120 }
121
122
130 template<typename Type, unsigned int N, enable_real<Type> = true>
131 inline void read_csv(const std::string& filename, vec<Type, N>& v) {
132
133 std::ifstream file (filename);
134 std::string line;
135
136 if (!file.is_open()) {
137 TH_IO_ERROR("io::read_csv", filename, IoError::ReadError);
138 return;
139 }
140
141 // Check for header
142 if (!std::getline(file, line))
143 return;
144
146
147 // Resulting column vector
148 std::vector<real> col;
149
150 if (io::is_number(line)) {
151
152 real first;
153
154 try {
155 std::replace(line.begin(), line.end(), ',', '.');
156 first = std::stod(line);
157 col.emplace_back(first);
158 } catch (const std::invalid_argument& e) {
159 // Do nothing, the entry is not a number
160 } catch(const std::out_of_range& e) {
161 TH_MATH_ERROR("io::read_csv", first, MathError::OutOfRange);
162 }
163 }
164
165 // All remaining lines are data
166 while (std::getline(file, line)) {
167
169 std::replace(line.begin(), line.end(), ',', '.');
170
171 try {
172 real val = std::stod(line);
173 col.emplace_back(val);
174 } catch (const std::exception& e) {
175 col.emplace_back(nan());
176 }
177 }
178
179 // Handle mismatched sizes with empty values (NaN)
180 if (v.size() > col.size()) {
181
182 for (size_t i = 0; i < col.size(); i++)
183 v[i] = col[i];
184
185 for (size_t i = col.size(); i < v.size(); i++)
186 v[i] = nan();
187
188 } else {
189 algebra::vec_copy(v, col);
190 }
191 }
192
193
204 template<typename Type, unsigned int N, enable_real<Type> = true>
205 inline void read_csv(
206 const std::string& filename, const std::string& col_name,
207 vec<Type, N>& v, bool trim_nan = true) {
208
209 std::ifstream file (filename);
210 std::string line;
211
212 if (!file.is_open()) {
213 TH_IO_ERROR("io::read_csv", filename, IoError::ReadError);
215 return;
216 }
217
218 // Read header
219 if (!std::getline(file, line))
220 return;
221
222 // Find the index of the specified column
223 std::vector<std::string> headers = parse_csv(line);
224 int col_index = -1;
225 for (size_t i = 0; i < headers.size(); ++i) {
226
227 if (headers[i] == col_name) {
228 col_index = i;
229 break;
230 }
231 }
232
233 // No column was found
234 if (col_index == -1) {
235
236 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
237
238 if (!v.size())
239 v.resize(1);
240
241 for (size_t i = 0; i < v.size(); ++i)
242 v[i] = nan();
243
244 return;
245 }
246
247 // Read data from the specified column
248 std::vector<real> data;
249 std::vector<std::string> cells;
250
251 while (std::getline(file, line)) {
252
254
255 if (size_t(col_index) < cells.size()) {
256
257 std::string cell = cells[col_index];
258 std::replace(cell.begin(), cell.end(), ',', '.');
259
260 try {
261 const real val = std::stod(cell);
262 data.emplace_back(val);
263 } catch (const std::exception& e) {
264 data.emplace_back(nan());
265 }
266 } else {
267 data.emplace_back(nan());
268 }
269 }
270
271 // Trim trailing NaN values if enabled
272 size_t actual_size = data.size();
273
274 if (trim_nan)
275 while (actual_size > 0 && is_nan(data[actual_size - 1]))
276 actual_size--;
277
278 // Need to allocate space
279 if (v.size() < actual_size) {
280
282
283 if (v.size() < actual_size) {
284 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
286 return;
287 }
288 }
289
290 for (size_t i = 0; i < actual_size; i++)
291 v[i] = data[i];
292
293 for (size_t i = actual_size; i < v.size(); i++)
294 v[i] = nan();
295 }
296
297
302 template<typename Type, unsigned int N, unsigned int M>
303 inline void write_csv(
304 const std::string& filename, const mat<Type, N, M>& A,
305 const std::string& delimiter = ", ", unsigned int precision = 8) {
306
307 std::ofstream file (filename);
308
309 if (!file.is_open()) {
310 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
311 return;
312 }
313
314 for (size_t i = 0; i < A.rows(); i++) {
315 for (size_t j = 0; j < A.cols(); j++) {
316
317 file << std::setprecision(precision) << A(i, j);
318
319 if (j != A.cols() - 1)
320 file << delimiter;
321 else
322 file << std::endl;
323 }
324 }
325 }
326
327
334 template<unsigned int N, unsigned int K>
335 inline void read_csv(const std::string& filename, mat<real, N, K>& A) {
336
337 std::ifstream file (filename);
338 std::string line;
339
340 if (!file.is_open()) {
341 TH_IO_ERROR("io::read_csv", filename, IoError::FileNotFound);
342 return;
343 }
344
345 std::vector<std::vector<real>> rows;
346
347 // Read first line to check for header
348 if (!std::getline(file, line))
349 return;
350
351 std::vector<std::string> first_row = parse_csv(line);
352
353 // Check if first line is a header
354 bool has_header = false;
355
356 for (const auto& cell : first_row) {
357
358 if (!io::is_number(cell)) {
359 has_header = true;
360 break;
361 }
362 }
363
364 // If first line is not a header, process it as data
365 if (!has_header) {
366
367 std::vector<real> row;
368
369 for (auto cell : first_row) {
370
371 std::replace(cell.begin(), cell.end(), ',', '.');
372
373 try {
374 row.emplace_back(std::stod(cell));
375 } catch (const std::exception& e) {
376 row.emplace_back(nan());
377 }
378 }
379
380 if (!row.empty())
381 rows.emplace_back(row);
382 }
383
384 // Read remaining lines
385 while (std::getline(file, line)) {
386
387 // Skip empty lines
388 if (line.empty())
389 continue;
390
391 std::vector<std::string> cells = parse_csv(line);
392 std::vector<real> row;
393
394 for (auto cell : cells) {
395
396 std::replace(cell.begin(), cell.end(), ',', '.');
397
398 try {
399 row.emplace_back(std::stod(cell));
400 } catch (const std::exception& e) {
401 row.emplace_back(nan());
402 }
403 }
404
405 if (!row.empty()) {
406 rows.emplace_back(row);
407 }
408 }
409
410 A.resize(rows.size(), rows[0].size());
411
412 if (A.rows() < rows[0].size() || A.cols() < rows.size()) {
413 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
415 return;
416 }
417
418 // Fill matrix with parsed data
419 for (size_t i = 0; i < min(rows.size(), A.rows()); ++i) {
420
421 for (size_t j = 0; j < min(rows[i].size(), A.cols()); ++j)
422 A(i, j) = rows[i][j];
423
424 // Pad remaining columns with NaN
425 for (size_t j = rows[i].size(); j < A.cols(); ++j)
426 A(i, j) = nan();
427 }
428
429 // Pad remaining rows with NaN
430 for (size_t i = rows.size(); i < A.rows(); ++i)
431 for (size_t j = 0; j < A.cols(); ++j)
432 A(i, j) = nan();
433 }
434
435
440 inline void write_csv(
441 const std::string& filename, const data_table& table,
442 const std::string& delimiter = ", ", unsigned int precision = 8) {
443
444 std::ofstream file (filename);
445
446 if (!file.is_open()) {
447 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
448 return;
449 }
450
451 // Write header
452 bool first = true;
453 for (const std::string& name : table.header()) {
454
455 if (!first)
456 file << delimiter;
457
458 file << quote_csv(name);
459 first = false;
460 }
461 file << std::endl;
462
463 // Write data rows
464 size_t max_rows = table.rows();
465 for (size_t i = 0; i < max_rows; ++i) {
466
467 first = true;
468 for (const auto& col : table.data()) {
469
470 if (!first)
471 file << delimiter;
472
473 if (i < col.size())
474 file << std::setprecision(precision) << col[i];
475 else
476 file << nan();
477
478 first = false;
479 }
480 file << std::endl;
481 }
482
483 }
484
485
492 inline void read_csv(const std::string& filename, data_table& table) {
493
494 std::ifstream file (filename);
495 std::string line;
496
497 if (!file.is_open()) {
498 TH_IO_ERROR("io::read_csv", filename, IoError::FileNotFound);
499 return;
500 }
501
502 // Read header
503 if (!std::getline(file, line))
504 return;
505
506 std::vector<std::string> first_row = parse_csv(line);
507
508 if (first_row.empty())
509 return;
510
511 std::vector<std::string> column_names;
512 size_t num_cols = first_row.size();
513 std::vector<vec<real>> columns (num_cols);
514
515 // Check if first line is a header
516 bool has_header = false;
517
518 for (const auto& cell : first_row) {
519
520 if (!io::is_number(cell)) {
521 has_header = true;
522 break;
523 }
524 }
525
526 // If first line is not a header, process it as data
527 if (!has_header) {
528
529 for (size_t j = 0; j < first_row.size(); ++j) {
530
531 std::string cell = first_row[j];
532 std::replace(cell.begin(), cell.end(), ',', '.');
533
534 try {
535 columns[j].append(std::stod(cell));
536 } catch (const std::exception& e) {
537 columns[j].append(nan());
538 }
539 }
540
541 // Generate default column names
542 for (size_t j = 0; j < num_cols; ++j) {
543 column_names.emplace_back("col" + std::to_string(j));
544 }
545
546 } else {
547
548 for (const auto& name : first_row)
549 column_names.emplace_back(io::unquote(io::trim(name)));
550 }
551 first_row.clear();
552
553 // Read data rows
554 while (std::getline(file, line)) {
555
556 if (line.empty())
557 continue;
558
559 std::vector<std::string> cells = parse_csv(line);
560
561 for (size_t j = 0; j < num_cols; ++j) {
562
563 if (j < cells.size()) {
564 std::string cell = cells[j];
565 std::replace(cell.begin(), cell.end(), ',', '.');
566
567 try {
568 real val = std::stod(cell);
569 columns[j].append(val);
570 } catch (const std::exception& e) {
571 columns[j].append(nan());
572 }
573 } else {
574 columns[j].append(nan());
575 }
576 }
577 }
578
579 for (size_t j = 0; j < num_cols; ++j)
580 table.insert(column_names[j], columns[j]);
581 }
582
583
596 inline void write_csv(
597 const std::string& filename, const histogram& hist,
598 bool normalized = false, bool lower_extreme = false,
599 const std::string& delimiter = ", ", unsigned int precision = 8) {
600
601 std::ofstream file (filename);
602 if (!file.is_open()) {
603 TH_IO_ERROR("io::write_csv", filename, IoError::FileNotFound);
604 return;
605 }
606
607 const auto bin_counts = hist.bins();
608
609 // Can't write histogram without bins
610 if (!bin_counts.size()) {
611 TH_IO_ERROR("io::write_csv", filename, IoError::FormatError);
612 return;
613 }
614
615 const real bin_dx = (hist.range()[1] - hist.range()[0]) / bin_counts.size();
616 real norm_factor = normalized ? hist.number() * bin_dx : 1.0;
618 norm_factor = 1.0;
619
620 // Keep track of the coordinate of the current bin, starting from the lowest bin edge or center.
621 real bin_value = lower_extreme ? hist.range()[0] : (hist.range()[0] + 0.5 * bin_dx);
622
623 // Write header with histogram statistics
624 file << "bins, counts, number, average, tss, min, max" << std::endl;
625 if (!bin_counts.size())
626 return;
627
628 file << std::setprecision(precision) << bin_value << delimiter;
629 file << std::setprecision(precision) << (bin_counts[0] / norm_factor) << delimiter;
630 file << hist.number() << delimiter;
631 file << hist.mean() << delimiter;
632 file << hist.tss() << delimiter;
633 file << hist.min() << delimiter;
634 file << hist.max() << std::endl;
635
636 for (size_t i = 1; i < bin_counts.size(); i++) {
637
638 bin_value += bin_dx;
639 file << std::setprecision(precision) << bin_value << delimiter;
640 file << std::setprecision(precision) << (bin_counts[i] / norm_factor) << std::endl;
641 }
642 }
643
644
651 inline void read_csv(const std::string& filename, histogram& hist, bool lower_extreme = false) {
652
653 std::ifstream file (filename);
654 if (!file.is_open()) {
655 TH_IO_ERROR("io::read_csv", filename, IoError::FileNotFound);
656 return;
657 }
658
659 std::string line;
660 std::vector<std::string> cells;
661
662 // Read header
663 std::getline(file, line);
665
666 int bin_index = -1, count_index = -1, number_index = -1, average_index = -1;
667 int tss_index = -1, min_index = -1, max_index = -1;
668
669 // Find column indices for the expected headers
670 for (size_t i = 0; i < cells.size(); i++) {
671
672 if (cells[i] == "bins") bin_index = i;
673 else if (cells[i] == "counts") count_index = i;
674 else if (cells[i] == "number") number_index = i;
675 else if (cells[i] == "average") average_index = i;
676 else if (cells[i] == "tss") tss_index = i;
677 else if (cells[i] == "min") min_index = i;
678 else if (cells[i] == "max") max_index = i;
679 }
680
681 if (bin_index == -1 || count_index == -1 || number_index == -1 ||
682 average_index == -1 || tss_index == -1 || min_index == -1 || max_index == -1) {
683
684 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
685 return;
686 }
687
688 // Find the maximum required column index
689 int min_size = bin_index;
696 min_size++;
697
698 // Read first data line for statistics
699 std::getline(file, line);
701
702 if (cells.size() < size_t(min_size)) {
703 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
704 return;
705 }
706
707 vec<real> counts, bins;
708 size_t N;
709 real run_average, run_tss, value_min, value_max;
710
711 try {
712
713 counts = {std::stod(cells[count_index])};
714 bins = {std::stod(cells[bin_index])};
715
716 N = std::stod(cells[number_index]);
717 run_average = std::stod(cells[average_index]);
718 run_tss = std::stod(cells[tss_index]);
719 value_min = std::stod(cells[min_index]);
720 value_max = std::stod(cells[max_index]);
721
722 } catch (const std::invalid_argument& e) {
723 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
724 return;
725 }
726
728
729 // Read remaining data lines
730 while (std::getline(file, line)) {
731
733
734 if (cells.size() < size_t(bins_max_index)) {
735 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
736 return;
737 }
738
739 try {
740 counts.append(cells[count_index] != "" ? std::stod(cells[count_index]) : nan());
741 bins.append(cells[bin_index] != "" ? std::stod(cells[bin_index]) : nan());
742 } catch (const std::exception& e) {
743 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
744 return;
745 }
746
747 }
748
749 real range_min;
750 real range_max;
751
752 bool is_normalized = false;
753 for (size_t i = 0; i < counts.size(); i++) {
754
755 // Check if any bin counts are not integers
756 if (counts[i] != floor(counts[i])) {
757 is_normalized = true;
758 break;
759 }
760 }
761
762 std::vector<unsigned int> bin_counts (counts.size());
763 for (size_t i = 0; i < counts.size(); i++) {
764 bin_counts[i] = (unsigned int) (is_normalized ? (counts[i] * N) : counts[i]);
765 }
766
767 const real bin_dx = bins.size() > 1 ? (bins[1] - bins[0]) : 0;
768 range_min = lower_extreme ? bins[0] : (bins[0] - 0.5 * bin_dx);
769 range_max = lower_extreme ? bins[bins.size() - 1] + bin_dx : bins[bins.size() - 1] + 0.5 * bin_dx;
770
771 // Check constant bin spacing
772 for (size_t i = 1; i < bins.size(); i++) {
773 if (abs((bins[i] - bins[i - 1]) - bin_dx) > 1e-6) {
774 TH_IO_ERROR("io::read_csv", filename, IoError::FormatError);
775 return;
776 }
777 }
778
779 hist.rebuild(
780 bin_counts, vec2({range_min, range_max}),
781 N, run_average, run_tss,
782 value_min, value_max
783 );
784
785 }
786
/// Read an object of the given type from a CSV file and return it by value.
///
/// A default-constructed object is passed to the read_csv overload taking
/// the file name and a reference to that type, then returned.
///
/// @tparam Type The type of the object to read
/// @param filename The path of the input file
/// @return The object after the call to read_csv
template<typename Type>
inline Type read_csv(const std::string& filename) {

    Type result;
    read_csv(filename, result);

    return result;
}
802
803}}
804
805#endif
A data structure for holding labeled columns of data, where each column is a vector of real numbers.
Definition data_table.h:26
Histogram class with running statistics, can be constructed from the parameters of the bins or from a...
Definition histogram.h:28
A generic matrix with a fixed number of rows and columns.
Definition mat.h:136
TH_CONSTEXPR unsigned int rows() const
Returns the number of rows in the matrix.
Definition mat.h:641
TH_CONSTEXPR unsigned int cols() const
Returns the number of columns in the matrix.
Definition mat.h:648
mat< Type, N, K > resize(unsigned int n, unsigned int k)
Compatibility function to allow for allocation or resizing of dynamic matrices.
Definition mat.h:730
A statically allocated N-dimensional vector with elements of the given type.
Definition vec.h:92
void resize(size_t n) const
Compatibility function to allow for allocation or resizing of dynamic vectors.
Definition vec.h:459
TH_CONSTEXPR unsigned int size() const
Returns the size of the vector (N)
Definition vec.h:449
#define TH_MATH_ERROR(F_NAME, VALUE, EXCEPTION)
TH_MATH_ERROR is a macro which throws exceptions or modifies errno (depending on which compilation op...
Definition error.h:219
Data table structure for holding labeled columns of data.
Error handling for IO operations.
Vector1 & vec_copy(Vector1 &dest, const Vector2 &src)
Copy a vector by overwriting another.
Definition algebra.h:241
Matrix & mat_error(Matrix &m)
Overwrite the given matrix with the error matrix with NaN values on the diagonal and zeroes everywher...
Definition algebra.h:40
Vector & vec_error(Vector &v)
Overwrite the given vector with the error vector with NaN values.
Definition algebra.h:58
bool is_number(const std::string &str)
Check if a given string could be correctly interpreted as a number.
Definition strings.h:20
std::string trim(const std::string &str)
Remove all leading and trailing whitespace from a string, returning the resulting string.
Definition strings.h:40
@ FileNotFound
File or directory not found.
@ FormatError
The file format is invalid or the data is corrupted.
@ ReadError
Error occurred while reading from the file or stream.
std::string quote_csv(const std::string &str)
Given a string entry, sanitize it for printing to a CSV file.
Definition csv.h:62
std::vector< std::string > parse_csv(const std::string &line, char delimiter=',')
Parse a CSV line handling quoted fields.
Definition csv.h:32
void read_csv(const std::string &filename, vec< Type, N > &v)
Read a vector from a file in the CSV format.
Definition csv.h:131
void write_csv(const std::string &filename, const vec< Type, N > &v, unsigned int precision=8)
Write a vector to file in the CSV format.
Definition csv.h:86
std::string unquote(const std::string &str)
Remove leading and trailing double quotes from a string, if both are present.
Definition strings.h:58
Main namespace of the library which contains all functions and objects.
Definition algebra.h:27
double real
A real number, defined as a floating point type.
Definition constants.h:207
auto min(const Vector &X)
Finds the minimum value inside a dataset.
Definition dataset.h:347
vec< real, 2 > vec2
A 2-dimensional vector with real elements.
Definition algebra_types.h:39
bool is_nan(const T &x)
Check whether a generic variable is (equivalent to) a NaN number.
Definition error.h:90
dual2 abs(dual2 x)
Compute the absolute value of a second order dual number.
Definition dual2_functions.h:242
Vector make_error()
Create a vector representing an error state, with all NaN values.
Definition algebra.h:103
TH_CONSTEXPR real nan()
Return a quiet NaN number in floating point representation.
Definition error.h:74
@ OutOfRange
Result out of range.
constexpr real MACH_EPSILON
Machine epsilon for the real type.
Definition constants.h:216
TH_CONSTEXPR int floor(real x)
Compute the floor of x, as the greatest integer that is not greater than x.
Definition real_analysis.h:271
String manipulation functions.