From 899facc74b5772a5a8c89be2d7d56375844519ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96mer=20Faruk=20Demirel?= Date: Tue, 31 Mar 2026 08:29:51 +0300 Subject: [PATCH] feat: frame implemented --- CMakeLists.txt | 24 +- src/frame/frame.h | 712 ++++++++++++++++++++++++++++++++++++ src/frame/frame_example.cpp | 233 ++++++++++++ src/frame/frame_test.cpp | 405 ++++++++++++++++++++ src/frame/io/csv_parser.h | 137 +++++++ src/frame/io/json_parser.h | 266 ++++++++++++++ src/frame/ml.h | 458 +++++++++++++++++++++++ src/frame/series.h | 684 ++++++++++++++++++++++++++++++++++ src/frame/stats.h | 275 ++++++++++++++ 9 files changed, 3182 insertions(+), 12 deletions(-) create mode 100644 src/frame/frame.h create mode 100644 src/frame/frame_example.cpp create mode 100644 src/frame/frame_test.cpp create mode 100644 src/frame/io/csv_parser.h create mode 100644 src/frame/io/json_parser.h create mode 100644 src/frame/ml.h create mode 100644 src/frame/series.h create mode 100644 src/frame/stats.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5885b40..9366cc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,26 +11,21 @@ add_library(tensor_engine STATIC ) target_include_directories(tensor_engine PUBLIC src) -add_library(linear_regression STATIC - src/supervised/regression/linear_regression.cpp +add_library(frame STATIC + src/frame/frame.h ) -target_include_directories(linear_regression PUBLIC src) -target_link_libraries(linear_regression PUBLIC tensor_engine) +target_include_directories(frame PUBLIC src) +target_link_libraries(frame PUBLIC tensor_engine) add_executable(tensor_benchmark src/tensor/tensor_benchmark.cpp ) target_link_libraries(tensor_benchmark tensor_engine) -add_executable(linear_regression_example - src/supervised/regression/linear_regression_example.cpp +add_executable(frame_example + src/frame/frame_example.cpp ) -target_link_libraries(linear_regression_example linear_regression) - -add_executable(linear_regression_benchmark - 
src/supervised/regression/linear_regression_benchmark.cpp -) -target_link_libraries(linear_regression_benchmark linear_regression) +target_link_libraries(frame_example frame) enable_testing() @@ -46,3 +41,8 @@ add_executable(tensor_test src/tensor/tensor_test.cpp) target_link_libraries(tensor_test tensor_engine GTest::gtest_main) include(GoogleTest) gtest_discover_tests(tensor_test) + +add_executable(frame_test src/frame/frame_test.cpp) +target_link_libraries(frame_test frame GTest::gtest_main) +include(GoogleTest) +gtest_discover_tests(frame_test) diff --git a/src/frame/frame.h b/src/frame/frame.h new file mode 100644 index 0000000..cbc0923 --- /dev/null +++ b/src/frame/frame.h @@ -0,0 +1,712 @@ +#pragma once + +#include "../tensor/tensor.h" +#include "../tensor/cpu_engine.h" +#include "series.h" +#include "io/csv_parser.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ml { + +template +class DataFrame { +public: + using ColumnType = Series; + + DataFrame() : rows_(0), engine_(std::make_shared>()) {} + + explicit DataFrame(size_t rows, size_t cols) : rows_(rows), engine_(std::make_shared>()) { + for (size_t i = 0; i < cols; ++i) { + column_names_.push_back("column_" + std::to_string(i)); + columns_.emplace_back("column_" + std::to_string(i), engine_); + } + rebuild_index(); + } + + size_t rows() const { return rows_; } + size_t cols() const { return columns_.size(); } + bool empty() const { return rows_ == 0; } + + const std::vector& columns() const { return column_names_; } + const std::string& columns(size_t i) const { return column_names_[i]; } + + Series& add_column(const std::string& name) { + if (column_index_.count(name)) { + throw std::invalid_argument("Column already exists: " + name); + } + column_names_.push_back(name); + columns_.emplace_back(name, engine_); + column_index_[name] = columns_.size() - 1; + return columns_.back(); 
+ } + + Series& add_column(const std::string& name, const std::vector& data) { + auto& col = add_column(name); + for (const auto& val : data) { + col.push_back(val); + } + if (rows_ == 0) { + rows_ = col.size(); + } else if (col.size() != rows_) { + throw std::invalid_argument("Row count mismatch"); + } + return col; + } + + void remove_column(const std::string& name) { + auto it = column_index_.find(name); + if (it == column_index_.end()) { + throw std::invalid_argument("Column not found: " + name); + } + column_names_.erase(column_names_.begin() + it->second); + columns_.erase(columns_.begin() + it->second); + rebuild_index(); + } + + Series& col(const std::string& name) { + auto it = column_index_.find(name); + if (it == column_index_.end()) { + throw std::invalid_argument("Column not found: " + name); + } + return columns_[it->second]; + } + + const Series& col(const std::string& name) const { + auto it = column_index_.find(name); + if (it == column_index_.end()) { + throw std::invalid_argument("Column not found: " + name); + } + return columns_[it->second]; + } + + Series& col(size_t idx) { + if (idx >= columns_.size()) { + throw std::out_of_range("Column index out of range"); + } + return columns_[idx]; + } + + const Series& col(size_t idx) const { + if (idx >= columns_.size()) { + throw std::out_of_range("Column index out of range"); + } + return columns_[idx]; + } + + DataFrame select(const std::vector& cols) const { + DataFrame result; + for (const auto& name : cols) { + auto it = column_index_.find(name); + if (it == column_index_.end()) { + throw std::invalid_argument("Column not found: " + name); + } + result.columns_.push_back(columns_[it->second]); + result.column_names_.push_back(name); + } + result.rows_ = rows_; + result.rebuild_index(); + return result; + } + + DataFrame head(size_t n) const { + n = std::min(n, rows_); + return iloc(0, n); + } + + DataFrame tail(size_t n) const { + n = std::min(n, rows_); + return iloc(rows_ - n, rows_); + } + + 
DataFrame iloc(size_t row_idx) const { + return iloc(std::vector{row_idx}); + } + + DataFrame iloc(size_t start_row, size_t end_row) const { + std::vector row_idxs(end_row - start_row); + std::iota(row_idxs.begin(), row_idxs.end(), start_row); + return iloc(row_idxs); + } + + DataFrame iloc(const std::vector& row_idxs) const { + DataFrame result; + result.column_names_ = column_names_; + result.columns_ = columns_; + result.rows_ = row_idxs.size(); + result.engine_ = engine_; + result.rebuild_index(); + + for (size_t i = 0; i < result.columns_.size(); ++i) { + Series sampled(result.column_names_[i], result.engine_); + for (size_t idx : row_idxs) { + if (idx < columns_[i].size()) { + sampled.push_back(columns_[i][idx]); + } + } + result.columns_[i] = std::move(sampled); + } + return result; + } + + DataFrame sort_values(const std::string& by, bool ascending = true) const { + auto it = column_index_.find(by); + if (it == column_index_.end()) { + throw std::invalid_argument("Column not found: " + by); + } + + std::vector indices(rows_); + std::iota(indices.begin(), indices.end(), 0); + + const auto& key_col = columns_[it->second]; + std::vector key_data = key_col.to_vector(); + + std::sort(indices.begin(), indices.end(), + [ascending, &key_data](size_t a, size_t b) { + return ascending ? 
(key_data[a] < key_data[b]) : (key_data[a] > key_data[b]); + }); + + return iloc(indices); + } + + DataFrame dropna() const { + std::vector keep_idxs; + for (size_t i = 0; i < rows_; ++i) { + bool keep = true; + for (const auto& col : columns_) { + if (i < col.size() && std::isnan(static_cast(col[i]))) { + keep = false; + break; + } + } + if (keep) keep_idxs.push_back(i); + } + return iloc(keep_idxs); + } + + DataFrame fillna(T value) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col.fillna(value); + } + return result; + } + + std::optional index_of(const std::string& column) const { + auto it = column_index_.find(column); + if (it != column_index_.end()) { + return it->second; + } + return std::nullopt; + } + + T at(size_t row, size_t col) const { + if (col >= columns_.size()) { + throw std::out_of_range("Column index out of range"); + } + if (row >= columns_[col].size()) { + throw std::out_of_range("Row index out of range"); + } + return columns_[col][row]; + } + + bool contains(const std::string& column) const { + return column_index_.count(column) > 0; + } + + void to_csv(const std::string& filename, char delimiter = ',') const { + std::ofstream file(filename); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + filename); + } + to_csv(file, delimiter); + } + + void to_csv(std::ostream& os, char delimiter = ',') const { + for (size_t i = 0; i < column_names_.size(); ++i) { + if (i > 0) os << delimiter; + os << column_names_[i]; + } + os << "\n"; + + for (size_t r = 0; r < rows_; ++r) { + for (size_t c = 0; c < columns_.size(); ++c) { + if (c > 0) os << delimiter; + if (r < columns_[c].size()) { + os << columns_[c][r]; + } + } + os << "\n"; + } + } + + static DataFrame from_csv(const std::string& filename, char delimiter = ',', + bool header = true, size_t skip_rows = 0) { + std::ifstream file(filename); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + filename); + } + return 
from_csv(file, delimiter, header, skip_rows); + } + + static DataFrame from_csv(std::istream& is, char delimiter = ',', + bool header = true, size_t skip_rows = 0) { + DataFrame result; + auto engine = std::make_shared>(); + + std::string line; + for (size_t i = 0; i < skip_rows && std::getline(is, line); ++i) {} + + if (header && std::getline(is, line)) { + std::vector headers = CSVParser::parse_line(line, delimiter); + for (const auto& h : headers) { + result.column_names_.push_back(h); + } + } + + std::vector> data_cols(result.column_names_.size()); + + while (std::getline(is, line)) { + auto values = CSVParser::parse_line(line, delimiter); + for (size_t i = 0; i < std::min(values.size(), result.column_names_.size()); ++i) { + try { + data_cols[i].push_back(static_cast(std::stod(values[i]))); + } catch (...) { + data_cols[i].push_back(std::nan("")); + } + } + } + + result.engine_ = engine; + for (size_t i = 0; i < result.column_names_.size(); ++i) { + Series col(result.column_names_[i], engine); + for (const auto& val : data_cols[i]) { + col.push_back(val); + } + result.columns_.push_back(col); + } + + if (!result.column_names_.empty()) { + result.rows_ = data_cols[0].size(); + } + result.rebuild_index(); + result.index_.resize(result.rows_); + for (size_t i = 0; i < result.rows_; ++i) { + result.index_[i] = std::to_string(i); + } + + return result; + } + + void print(std::ostream& os = std::cout, size_t max_rows = 10) const { + os << to_string(max_rows); + } + + std::string to_string(size_t max_rows = 10) const { + std::ostringstream oss; + + size_t display_rows = std::min(rows_, max_rows); + + oss << std::string(15 * columns_.size() + 2, '-') << "\n"; + for (size_t j = 0; j < column_names_.size(); ++j) { + oss << std::setw(14) << std::left << column_names_[j] << " |"; + } + oss << "\n" << std::string(15 * columns_.size() + 2, '-') << "\n"; + + for (size_t i = 0; i < display_rows; ++i) { + for (size_t j = 0; j < columns_.size(); ++j) { + if (i < 
columns_[j].size()) { + oss << std::setw(14) << std::left << columns_[j][i] << " |"; + } else { + oss << std::setw(14) << std::left << "NaN" << " |"; + } + } + oss << "\n"; + } + + if (rows_ > display_rows) { + oss << "... " << (rows_ - display_rows) << " more rows\n"; + } + oss << std::string(15 * columns_.size() + 2, '-') << "\n"; + + return oss.str(); + } + + DataFrame operator+(const DataFrame& other) const { + DataFrame result = *this; + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + result.columns_[i] = columns_[i] + other.columns_[i]; + } + return result; + } + + DataFrame operator-(const DataFrame& other) const { + DataFrame result = *this; + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + result.columns_[i] = columns_[i] - other.columns_[i]; + } + return result; + } + + DataFrame operator*(const DataFrame& other) const { + DataFrame result = *this; + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + result.columns_[i] = columns_[i] * other.columns_[i]; + } + return result; + } + + DataFrame operator/(const DataFrame& other) const { + DataFrame result = *this; + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + result.columns_[i] = columns_[i] / other.columns_[i]; + } + return result; + } + + DataFrame operator+(T scalar) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col + scalar; + } + return result; + } + + DataFrame operator-(T scalar) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col - scalar; + } + return result; + } + + DataFrame operator*(T scalar) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col * scalar; + } + return result; + } + + DataFrame operator/(T scalar) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col / scalar; + } + return result; + } + + DataFrame& 
operator+=(const DataFrame& other) { + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + columns_[i] += other.columns_[i]; + } + return *this; + } + + DataFrame& operator-=(const DataFrame& other) { + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + columns_[i] -= other.columns_[i]; + } + return *this; + } + + DataFrame& operator*=(const DataFrame& other) { + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + columns_[i] *= other.columns_[i]; + } + return *this; + } + + DataFrame& operator/=(const DataFrame& other) { + for (size_t i = 0; i < std::min(columns_.size(), other.columns_.size()); ++i) { + columns_[i] /= other.columns_[i]; + } + return *this; + } + + DataFrame& operator+=(T scalar) { + for (auto& col : columns_) { + col += scalar; + } + return *this; + } + + DataFrame& operator-=(T scalar) { + for (auto& col : columns_) { + col -= scalar; + } + return *this; + } + + DataFrame& operator*=(T scalar) { + for (auto& col : columns_) { + col *= scalar; + } + return *this; + } + + DataFrame& operator/=(T scalar) { + for (auto& col : columns_) { + col /= scalar; + } + return *this; + } + + DataFrame cumsum() const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col.cumsum(); + } + return result; + } + + DataFrame abs() const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col.abs(); + } + return result; + } + + T cov(const std::string& col1, const std::string& col2) const { + auto it1 = column_index_.find(col1); + auto it2 = column_index_.find(col2); + if (it1 == column_index_.end() || it2 == column_index_.end()) { + throw std::invalid_argument("Column not found"); + } + + const auto& series1 = columns_[it1->second]; + const auto& series2 = columns_[it2->second]; + + double mean1 = series1.mean(); + double mean2 = series2.mean(); + double sum = 0; + for (size_t i = 0; i < std::min(series1.size(), series2.size()); 
++i) { + sum += (series1[i] - mean1) * (series2[i] - mean2); + } + return sum / (std::min(series1.size(), series2.size()) - 1); + } + + T corr(const std::string& col1, const std::string& col2) const { + T c = this->cov(col1, col2); + double std1 = this->col(col1).std(); + double std2 = this->col(col2).std(); + if (std1 > 0 && std2 > 0) { + return c / (std1 * std2); + } + return 0; + } + + DataFrame describe() const { + DataFrame result; + result.engine_ = engine_; + for (const auto& col : columns_) { + Series stats_col(col.name(), engine_); + stats_col.push_back(col.mean()); + stats_col.push_back(col.std()); + stats_col.push_back(col.min()); + stats_col.push_back(col.max()); + result.columns_.push_back(stats_col); + result.column_names_.push_back(col.name()); + } + result.rows_ = 1; + result.rebuild_index(); + return result; + } + + DataFrame agg(const std::map& ops) const { + DataFrame result; + result.engine_ = engine_; + for (const auto& [col_name, op] : ops) { + auto it = column_index_.find(col_name); + if (it != column_index_.end()) { + const auto& col = columns_[it->second]; + Series result_col(col_name, engine_); + if (op == "mean") { + result_col.push_back(col.mean()); + } else if (op == "sum") { + result_col.push_back(col.sum()); + } else if (op == "min") { + result_col.push_back(col.min()); + } else if (op == "max") { + result_col.push_back(col.max()); + } else if (op == "std") { + result_col.push_back(col.std()); + } + result.columns_.push_back(result_col); + result.column_names_.push_back(col_name); + } + } + result.rows_ = 1; + result.rebuild_index(); + return result; + } + + DataFrame concat(const DataFrame& other) const { + DataFrame result = *this; + for (size_t i = 0; i < other.columns_.size() && i < result.columns_.size(); ++i) { + for (size_t j = 0; j < other.columns_[i].size(); ++j) { + result.columns_[i].push_back(other.columns_[i][j]); + } + } + result.rows_ += other.rows_; + return result; + } + + void info() const { + std::cout << 
"DataFrame: " << rows_ << " rows x " << cols() << " columns\n"; + std::cout << "Columns:\n"; + for (size_t i = 0; i < column_names_.size(); ++i) { + std::cout << " " << column_names_[i] << " (float64)\n"; + } + std::cout << "Memory usage: " << memory_usage() << " bytes\n"; + } + + size_t memory_usage() const { + size_t total = sizeof(*this); + for (const auto& col : columns_) { + total += col.size() * sizeof(T); + } + return total; + } + + const std::vector& index() const { return index_; } + + void set_index(const std::vector& idx) { index_ = idx; } + + DataFrame reset_index() const { + DataFrame result = *this; + result.index_.resize(result.rows_); + for (size_t i = 0; i < result.rows_; ++i) { + result.index_[i] = std::to_string(i); + } + return result; + } + + DataFrame sample(size_t n = 1, unsigned int seed = 0) const { + std::mt19937 gen(seed); + std::uniform_int_distribution dist(0, rows_ - 1); + std::vector idxs(n); + for (size_t i = 0; i < n; ++i) { + idxs[i] = dist(gen); + } + return iloc(idxs); + } + + DataFrame clip(T lower, T upper) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col.clip(lower, upper); + } + return result; + } + + DataFrame ewm(T alpha) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col.ewm(alpha); + } + return result; + } + + DataFrame rolling(size_t window, bool center = false) const { + DataFrame result = *this; + for (auto& col : result.columns_) { + col = col.rolling(window, center); + } + return result; + } + + DataFrame diff(int periods = 1) const { + DataFrame result; + result.engine_ = engine_; + result.column_names_ = column_names_; + result.rows_ = rows_; + result.index_ = index_; + result.rebuild_index(); + + for (const auto& col : columns_) { + result.columns_.push_back(col.diff(periods)); + } + + return result; + } + + DataFrame round(int decimals = 0) const { + DataFrame result = *this; + T factor = std::pow(10.0, decimals); + for (auto& col : 
result.columns_) { + for (size_t i = 0; i < col.size(); ++i) { + col[i] = std::round(col[i] * factor) / factor; + } + } + return result; + } + + Series isnull() const { + Series result("isnull", engine_); + for (const auto& col : columns_) { + Series col_null = col.isnull(); + for (size_t i = 0; i < col_null.size(); ++i) { + result.push_back(col_null[i]); + } + } + return result; + } + + void drop_inplace(const std::vector& idxs) { + std::vector keep(rows_, true); + for (size_t idx : idxs) { + if (idx < rows_) { + keep[idx] = false; + } + } + + for (auto& col : columns_) { + std::vector new_data; + for (size_t i = 0; i < col.size(); ++i) { + if (keep[i]) { + new_data.push_back(col[i]); + } + } + col = Series(col.name(), new_data); + } + + rows_ = std::count(keep.begin(), keep.end(), true); + } + + std::shared_ptr> engine() { return engine_; } + +private: + size_t rows_; + std::vector column_names_; + std::vector columns_; + std::vector index_; + std::map column_index_; + std::shared_ptr> engine_; + + void rebuild_index() { + column_index_.clear(); + for (size_t i = 0; i < column_names_.size(); ++i) { + column_index_[column_names_[i]] = i; + } + } +}; + +using DataFrameDouble = DataFrame; + +} // namespace ml diff --git a/src/frame/frame_example.cpp b/src/frame/frame_example.cpp new file mode 100644 index 0000000..54557f3 --- /dev/null +++ b/src/frame/frame_example.cpp @@ -0,0 +1,233 @@ +#include +#include +#include +#include +#include +#include + +int main() { + + ml::DataFrame df; + df.add_column("A", std::vector{1.0, 2.0, 3.0, 4.0, 5.0}); + df.add_column("B", std::vector{10.0, 20.0, 30.0, 40.0, 50.0}); + df.add_column("C", std::vector{100.0, 200.0, 300.0, 400.0, 500.0}); + + std::cout << "=== Original DataFrame ===\n" << df.to_string() << "\n"; + + std::cout << "=== Statistics ===\n"; + std::cout << "Mean of A: " << df.col("A").mean() << "\n"; + std::cout << "Sum of B: " << df.col("B").sum() << "\n"; + std::cout << "Std of C: " << df.col("C").std() << "\n"; + 
std::cout << "Min of A: " << df.col("A").min() << "\n"; + std::cout << "Max of C: " << df.col("C").max() << "\n"; + std::cout << "Median of B: " << df.col("B").median() << "\n"; + std::cout << "Quantile 25%: " << df.col("A").quantile(0.25) << "\n"; + std::cout << "Quantile 75%: " << df.col("A").quantile(0.75) << "\n"; + + std::cout << "\n=== Operations ===\n"; + auto df2 = df * 2.0; + std::cout << "DataFrame * 2:\n" << df2.to_string() << "\n"; + + auto df3 = df + df2; + std::cout << "DataFrame + DataFrame*2:\n" << df3.to_string() << "\n"; + + std::cout << "\n=== Indexing ===\n"; + std::cout << "Head(3):\n" << df.head(3).to_string() << "\n"; + std::cout << "Tail(2):\n" << df.tail(2).to_string() << "\n"; + + std::cout << "\n=== Sorting ===\n"; + auto sorted_desc = df.sort_values("A", false); + std::cout << "Sorted by A (descending):\n" << sorted_desc.to_string() << "\n"; + + std::cout << "\n=== Stats Module ===\n"; + auto data = df.col("A").to_vector(); + std::cout << "Variance: " << ml::stats::variance(data) << "\n"; + std::cout << "Skewness: " << ml::stats::skewness(data) << "\n"; + std::cout << "Kurtosis: " << ml::stats::kurtosis(data) << "\n"; + std::cout << "Median: " << ml::stats::median(data) << "\n"; + + std::cout << "\n=== Normalization ===\n"; + auto normalized = ml::stats::normalize(data); + std::cout << "Normalized A: ["; + for (size_t i = 0; i < normalized.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << normalized[i]; + } + std::cout << "]\n"; + + std::cout << "\n=== Standardization ===\n"; + auto standardized = ml::stats::standardize(data); + std::cout << "Standardized A: ["; + for (size_t i = 0; i < standardized.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << standardized[i]; + } + std::cout << "]\n"; + + std::cout << "\n=== Rolling Window ===\n"; + auto rolled = df.col("A").rolling(2); + std::cout << "Rolling mean (window=2):\n"; + rolled.print(); + + std::cout << "\n=== Cumulative Operations ===\n"; + auto cumsum = 
df.col("A").cumsum(); + std::cout << "Cumulative sum of A:\n"; + cumsum.print(); + + auto cummax = df.col("A").cummax(); + std::cout << "Cumulative max of A:\n"; + cummax.print(); + + std::cout << "\n=== Exponentially Weighted ===\n"; + auto ewm = df.col("A").ewm(0.5); + std::cout << "EWM (alpha=0.5):\n"; + ewm.print(); + + std::cout << "\n=== Diff ===\n"; + auto diff = df.col("A").diff(); + std::cout << "Diff of A:\n"; + diff.print(); + + std::cout << "\n=== DataFrame Info ===\n"; + df.info(); + + std::cout << "\n=== ML Module Examples ===\n"; + + ml::DataFrame X_train; + X_train.add_column("Feature1", std::vector{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}); + X_train.add_column("Feature2", std::vector{2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0}); + + ml::DataFrame y_train; + y_train.add_column("Target", std::vector{3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0}); + + std::cout << "\n--- Training Data ---\n"; + std::cout << "X_train:\n" << X_train.to_string() << "\n"; + std::cout << "y_train:\n" << y_train.to_string() << "\n"; + + std::cout << "\n--- Train-Test Split ---\n"; + auto split = ml::ml_ops::train_test_split(X_train, y_train, 0.2, 42); + std::cout << "X_train split: " << split.X_train.rows() << " rows\n"; + std::cout << "X_test split: " << split.X_test.rows() << " rows\n"; + + std::cout << "\n--- Linear Regression ---\n"; + auto [weights, bias] = ml::ml_ops::linear_regression_fit(X_train, y_train.col("Target")); + std::cout << "Weights: ["; + for (size_t i = 0; i < weights.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << weights[i]; + } + std::cout << "]\n"; + std::cout << "Bias: " << bias << "\n"; + + auto predictions = ml::ml_ops::linear_regression_predict(X_train, weights, bias); + std::cout << "Predictions:\n" << predictions.to_string() << "\n"; + + std::cout << "\n--- Evaluation Metrics ---\n"; + ml::Series y_true("Y", std::vector{3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0}); + ml::Series 
y_pred("Y", std::vector{3.1, 6.2, 8.9, 12.3, 14.8, 17.9, 21.2, 23.8, 27.1, 30.2}); + + double mse = ml::ml_ops::mean_squared_error(y_true, y_pred); + double rmse = ml::ml_ops::root_mean_squared_error(y_true, y_pred); + double mae = ml::ml_ops::mean_absolute_error(y_true, y_pred); + double r2 = ml::ml_ops::r2_score(y_true, y_pred); + + std::cout << "MSE: " << mse << "\n"; + std::cout << "RMSE: " << rmse << "\n"; + std::cout << "MAE: " << mae << "\n"; + std::cout << "R2 Score: " << r2 << "\n"; + + std::cout << "\n--- Min-Max Scaling ---\n"; + auto X_scaled = ml::ml_ops::min_max_scaler(X_train); + std::cout << "Min-Max Scaled:\n" << X_scaled.to_string() << "\n"; + + std::cout << "\n--- Standard Scaling ---\n"; + auto X_standardized = ml::ml_ops::standard_scaler(X_train); + std::cout << "Standardized:\n" << X_standardized.to_string() << "\n"; + + std::cout << "\n--- Polynomial Features (degree=2) ---\n"; + ml::DataFrame X_poly; + X_poly.add_column("F1", std::vector{1.0, 2.0, 3.0, 4.0, 5.0}); + auto poly_features = ml::ml_ops::polynomial_features(X_poly, 2); + std::cout << "Polynomial Features:\n" << poly_features.to_string() << "\n"; + + std::cout << "\n--- Cross Terms ---\n"; + ml::DataFrame X_cross; + X_cross.add_column("A", std::vector{1.0, 2.0, 3.0}); + X_cross.add_column("B", std::vector{4.0, 5.0, 6.0}); + auto cross_terms = ml::ml_ops::cross_terms(X_cross); + std::cout << "Cross Terms:\n" << cross_terms.to_string() << "\n"; + + std::cout << "\n--- K-Means Clustering ---\n"; + std::vector cluster_data = {1.0, 1.5, 2.0, 10.0, 10.5, 11.0, 20.0, 20.5, 21.0}; + auto centroids = ml::ml_ops::k_means_clustering(cluster_data, 3, 100, 42); + std::cout << "K-Means Centroids (k=3): ["; + for (size_t i = 0; i < centroids.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << centroids[i]; + } + std::cout << "]\n"; + + std::cout << "\n--- Label Encoding ---\n"; + std::vector labels = {1.0, 0.0, 1.0, 2.0, 0.0, 1.0}; + auto encoded = ml::ml_ops::label_encode(labels); + 
std::cout << "Original: [1, 0, 1, 2, 0, 1]\n"; + std::cout << "Encoded: ["; + for (size_t i = 0; i < encoded.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << encoded[i]; + } + std::cout << "]\n"; + + std::cout << "\n--- Add Bias Term ---\n"; + auto X_with_bias = ml::ml_ops::add_bias_term(X_train); + std::cout << "With Bias Column:\n" << X_with_bias.to_string() << "\n"; + + std::cout << "\n--- Logistic Regression ---\n"; + std::vector features = {1.0, 2.0, 3.0}; + std::vector lr_weights = {0.5, -0.3, 0.8}; + double lr_bias = -0.5; + double prob = ml::ml_ops::logistic_regression_predict(features, lr_weights, lr_bias); + std::cout << "Probability: " << prob << "\n"; + + std::cout << "\n--- Classification Metrics ---\n"; + ml::Series y_true_cls("Y", std::vector{0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0}); + ml::Series y_pred_cls("Y", std::vector{0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0}); + double acc = ml::ml_ops::accuracy_score(y_true_cls, y_pred_cls); + std::cout << "Accuracy: " << acc << "\n"; + + std::cout << "\n=== Additional Stats Functions ===\n"; + std::vector sample_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; + auto desc = ml::stats::describe(sample_data); + std::cout << "Describe:\n"; + for (const auto& [key, val] : desc) { + std::cout << " " << key << ": " << val << "\n"; + } + + std::cout << "\n=== Covariance & Correlation ===\n"; + std::vector x = {1.0, 2.0, 3.0, 4.0, 5.0}; + std::vector y = {2.0, 4.0, 6.0, 8.0, 10.0}; + std::cout << "Covariance: " << ml::stats::covariance(x, y) << "\n"; + std::cout << "Correlation: " << ml::stats::correlation(x, y) << "\n"; + + std::cout << "\n=== DataFrame Concatenation ===\n"; + ml::DataFrame df_a; + df_a.add_column("X", std::vector{1.0, 2.0, 3.0}); + + ml::DataFrame df_b; + df_b.add_column("X", std::vector{4.0, 5.0, 6.0}); + + auto concatenated = df_a.concat(df_b); + std::cout << "Concatenated:\n" << concatenated.to_string() << "\n"; + + std::cout << "\n=== CSV Output ===\n"; + std::cout << 
"Writing to /tmp/output.csv...\n"; + df.to_csv("/tmp/output.csv"); + std::cout << "Done! Contents:\n"; + std::ifstream file("/tmp/output.csv"); + std::string line; + while (std::getline(file, line)) { + std::cout << line << "\n"; + } + + return 0; +} diff --git a/src/frame/frame_test.cpp b/src/frame/frame_test.cpp new file mode 100644 index 0000000..8acecd1 --- /dev/null +++ b/src/frame/frame_test.cpp @@ -0,0 +1,405 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ml; + +class DataFrameTest : public ::testing::Test { +protected: + void SetUp() override { + df1.add_column("A", std::vector{1.0, 2.0, 3.0, 4.0, 5.0}); + df1.add_column("B", std::vector{10.0, 20.0, 30.0, 40.0, 50.0}); + } + + DataFrame df1; +}; + +TEST_F(DataFrameTest, ConstructorDefault) { + DataFrame df; + EXPECT_EQ(df.rows(), 0); + EXPECT_EQ(df.cols(), 0); + EXPECT_TRUE(df.empty()); +} + +TEST_F(DataFrameTest, ConstructorWithSize) { + DataFrame df(3, 2); + EXPECT_EQ(df.rows(), 3); + EXPECT_EQ(df.cols(), 2); +} + +TEST_F(DataFrameTest, AddColumn) { + DataFrame df; + df.add_column("X", std::vector{1.0, 2.0, 3.0}); + EXPECT_EQ(df.cols(), 1); + EXPECT_EQ(df.rows(), 3); + EXPECT_TRUE(df.contains("X")); +} + +TEST_F(DataFrameTest, RemoveColumn) { + df1.remove_column("A"); + EXPECT_EQ(df1.cols(), 1); + EXPECT_FALSE(df1.contains("A")); +} + +TEST_F(DataFrameTest, ColAccessor) { + auto& col = df1.col("A"); + EXPECT_EQ(col.size(), 5); + EXPECT_EQ(col[0], 1.0); +} + +TEST_F(DataFrameTest, Head) { + auto head = df1.head(3); + EXPECT_EQ(head.rows(), 3); +} + +TEST_F(DataFrameTest, Tail) { + auto tail = df1.tail(2); + EXPECT_EQ(tail.rows(), 2); +} + +TEST_F(DataFrameTest, IlocSingle) { + auto row = df1.iloc(0); + EXPECT_EQ(row.rows(), 1); +} + +TEST_F(DataFrameTest, IlocRange) { + auto rows = df1.iloc(1, 3); + EXPECT_EQ(rows.rows(), 2); +} + +TEST_F(DataFrameTest, SortValues) { + auto sorted = df1.sort_values("A", false); + EXPECT_EQ(sorted.col("A")[0], 5.0); +} 
+ +TEST_F(DataFrameTest, DropNa) { + DataFrame df; + df.add_column("X", std::vector{1.0, 2.0, 3.0}); + auto dropped = df.dropna(); + EXPECT_EQ(dropped.rows(), 3); +} + +TEST_F(DataFrameTest, FillNa) { + DataFrame df; + df.add_column("X", std::vector{1.0, 2.0, 3.0}); + auto filled = df.fillna(5.0); + EXPECT_EQ(filled.col("X")[0], 1.0); +} + +TEST_F(DataFrameTest, IndexOf) { + auto idx = df1.index_of("A"); + EXPECT_TRUE(idx.has_value()); + EXPECT_EQ(idx.value(), 0); +} + +TEST_F(DataFrameTest, Contains) { + EXPECT_TRUE(df1.contains("A")); + EXPECT_FALSE(df1.contains("Z")); +} + +TEST_F(DataFrameTest, OperatorPlusScalar) { + auto result = df1 + 1.0; + EXPECT_EQ(result.col("A")[0], 2.0); +} + +TEST_F(DataFrameTest, OperatorMinusScalar) { + auto result = df1 - 1.0; + EXPECT_EQ(result.col("A")[0], 0.0); +} + +TEST_F(DataFrameTest, OperatorMultiplyScalar) { + auto result = df1 * 2.0; + EXPECT_EQ(result.col("A")[0], 2.0); +} + +TEST_F(DataFrameTest, OperatorDivideScalar) { + auto result = df1 / 2.0; + EXPECT_EQ(result.col("A")[0], 0.5); +} + +TEST_F(DataFrameTest, OperatorPlusDataFrame) { + auto result = df1 + df1; + EXPECT_EQ(result.col("A")[0], 2.0); +} + +TEST_F(DataFrameTest, Cumsum) { + auto result = df1.cumsum(); + EXPECT_EQ(result.col("A")[0], 1.0); + EXPECT_EQ(result.col("A")[1], 3.0); +} + +TEST_F(DataFrameTest, Diff) { + auto result = df1.diff(); + EXPECT_EQ(result.col("A")[1], 1.0); +} + +TEST_F(DataFrameTest, Describe) { + auto result = df1.describe(); + EXPECT_GT(result.rows(), 0); +} + +TEST_F(DataFrameTest, ToCsv) { + std::ostringstream oss; + df1.to_csv(oss); + std::string output = oss.str(); + EXPECT_NE(output.find("A"), std::string::npos); +} + +TEST_F(DataFrameTest, ToString) { + std::string str = df1.to_string(); + EXPECT_FALSE(str.empty()); +} + +TEST_F(DataFrameTest, Info) { + testing::internal::CaptureStdout(); + df1.info(); + std::string output = testing::internal::GetCapturedStdout(); + EXPECT_FALSE(output.empty()); +} + +TEST_F(DataFrameTest, 
MemoryUsage) { + size_t mem = df1.memory_usage(); + EXPECT_GT(mem, 0); +} + +class SeriesTest : public ::testing::Test { +protected: + void SetUp() override { + series1 = Series("Test", std::vector{1.0, 2.0, 3.0, 4.0, 5.0}); + } + + Series series1; +}; + +TEST_F(SeriesTest, Constructor) { + EXPECT_EQ(series1.size(), 5); + EXPECT_EQ(series1.name(), "Test"); +} + +TEST_F(SeriesTest, OperatorIndex) { + EXPECT_EQ(series1[0], 1.0); + EXPECT_EQ(series1[4], 5.0); +} + +TEST_F(SeriesTest, Sum) { + EXPECT_EQ(series1.sum(), 15.0); +} + +TEST_F(SeriesTest, Mean) { + EXPECT_EQ(series1.mean(), 3.0); +} + +TEST_F(SeriesTest, Min) { + EXPECT_EQ(series1.min(), 1.0); +} + +TEST_F(SeriesTest, Max) { + EXPECT_EQ(series1.max(), 5.0); +} + +TEST_F(SeriesTest, Std) { + double std_val = series1.std(); + EXPECT_GT(std_val, 0); +} + +TEST_F(SeriesTest, Median) { + EXPECT_EQ(series1.median(), 3.0); +} + +TEST_F(SeriesTest, Quantile) { + EXPECT_EQ(series1.quantile(0.5), 3.0); +} + +TEST_F(SeriesTest, Idxmin) { + EXPECT_EQ(series1.idxmin(), 0); +} + +TEST_F(SeriesTest, Idxmax) { + EXPECT_EQ(series1.idxmax(), 4); +} + +TEST_F(SeriesTest, Head) { + auto result = series1.head(3); + EXPECT_EQ(result.size(), 3); +} + +TEST_F(SeriesTest, Tail) { + auto result = series1.tail(2); + EXPECT_EQ(result.size(), 2); +} + +TEST_F(SeriesTest, Diff) { + auto result = series1.diff(); + EXPECT_EQ(result[1], 1.0); +} + +TEST_F(SeriesTest, Cumsum) { + auto result = series1.cumsum(); + EXPECT_EQ(result[0], 1.0); + EXPECT_EQ(result[1], 3.0); +} + +TEST_F(SeriesTest, OperatorPlusSeries) { + auto result = series1 + series1; + EXPECT_EQ(result[0], 2.0); +} + +TEST_F(SeriesTest, OperatorPlusScalar) { + auto result = series1 + 1.0; + EXPECT_EQ(result[0], 2.0); +} + +TEST_F(SeriesTest, OperatorMinusScalar) { + auto result = series1 - 1.0; + EXPECT_EQ(result[0], 0.0); +} + +TEST_F(SeriesTest, OperatorMultiplyScalar) { + auto result = series1 * 2.0; + EXPECT_EQ(result[0], 2.0); +} + +TEST_F(SeriesTest, 
OperatorDivideScalar) { + auto result = series1 / 2.0; + EXPECT_EQ(result[0], 0.5); +} + +TEST_F(SeriesTest, SortAscending) { + auto result = series1.sort(true); + EXPECT_EQ(result[0], 1.0); +} + +TEST_F(SeriesTest, SortDescending) { + auto result = series1.sort(false); + EXPECT_EQ(result[0], 5.0); +} + +TEST_F(SeriesTest, ToVector) { + auto vec = series1.to_vector(); + EXPECT_EQ(vec.size(), 5); + EXPECT_EQ(vec[0], 1.0); +} + +TEST_F(SeriesTest, Dot) { + Series other("Other", std::vector{1.0, 2.0, 3.0, 4.0, 5.0}); + EXPECT_EQ(series1.dot(other), 55.0); +} + +class StatsTest : public ::testing::Test { +protected: + std::vector data = {1.0, 2.0, 3.0, 4.0, 5.0}; +}; + +TEST_F(StatsTest, Mean) { + EXPECT_DOUBLE_EQ(stats::mean(data), 3.0); +} + +TEST_F(StatsTest, Variance) { + EXPECT_GT(stats::variance(data), 0); +} + +TEST_F(StatsTest, StdDev) { + EXPECT_GT(stats::std_dev(data), 0); +} + +TEST_F(StatsTest, Median) { + EXPECT_DOUBLE_EQ(stats::median(data), 3.0); +} + +TEST_F(StatsTest, Percentile) { + EXPECT_DOUBLE_EQ(stats::percentile(data, 0.5), 3.0); +} + +TEST_F(StatsTest, Normalize) { + auto result = stats::normalize(data); + EXPECT_EQ(result[0], 0.0); + EXPECT_EQ(result[4], 1.0); +} + +TEST_F(StatsTest, Standardize) { + auto result = stats::standardize(data); + EXPECT_LT(result[0], 0); +} + +TEST_F(StatsTest, Describe) { + auto result = stats::describe(data); + EXPECT_EQ(result["count"], 5.0); + EXPECT_EQ(result["mean"], 3.0); +} + +class MLTest : public ::testing::Test { +protected: + void SetUp() override { + X.add_column("F1", std::vector{1.0, 2.0, 3.0, 4.0, 5.0}); + X.add_column("F2", std::vector{10.0, 20.0, 30.0, 40.0, 50.0}); + y.add_column("Y", std::vector{2.0, 4.0, 6.0, 8.0, 10.0}); + } + + DataFrame X; + DataFrame y; +}; + +TEST_F(MLTest, TrainTestSplit) { + auto split = ml_ops::train_test_split(X, y, 0.2, 42); + EXPECT_GT(split.X_train.rows(), 0); + EXPECT_GT(split.X_test.rows(), 0); +} + +TEST_F(MLTest, MinMaxScaler) { + auto result = 
ml_ops::min_max_scaler(X); + EXPECT_EQ(result.cols(), X.cols()); +} + +TEST_F(MLTest, StandardScaler) { + auto result = ml_ops::standard_scaler(X); + EXPECT_EQ(result.cols(), X.cols()); +} + +TEST_F(MLTest, LinearRegressionFit) { + auto [weights, bias] = ml_ops::linear_regression_fit(X, y.col("Y")); + EXPECT_EQ(weights.size(), X.cols()); +} + +TEST_F(MLTest, MeanSquaredError) { + Series y_true("Y", std::vector{1.0, 2.0, 3.0}); + Series y_pred("Y", std::vector{1.1, 2.1, 2.9}); + double mse = ml_ops::mean_squared_error(y_true, y_pred); + EXPECT_GT(mse, 0); +} + +TEST_F(MLTest, R2Score) { + Series y_true("Y", std::vector{1.0, 2.0, 3.0}); + Series y_pred("Y", std::vector{1.1, 2.1, 2.9}); + double r2 = ml_ops::r2_score(y_true, y_pred); + EXPECT_LT(r2, 1.0); +} + +TEST_F(MLTest, AccuracyScore) { + Series y_true("Y", std::vector{0.0, 1.0, 0.0, 1.0}); + Series y_pred("Y", std::vector{0.0, 1.0, 1.0, 1.0}); + double acc = ml_ops::accuracy_score(y_true, y_pred); + EXPECT_EQ(acc, 0.75); +} + +class CSVParserTest : public ::testing::Test {}; + +TEST_F(CSVParserTest, ParseLine) { + auto result = CSVParser::parse_line("a,b,c", ','); + EXPECT_EQ(result.size(), 3); + EXPECT_EQ(result[0], "a"); +} + +TEST_F(CSVParserTest, ParseLineWithQuotes) { + auto result = CSVParser::parse_line("\"a,b\",c", ','); + EXPECT_EQ(result.size(), 2); +} + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/frame/io/csv_parser.h b/src/frame/io/csv_parser.h new file mode 100644 index 0000000..d2a3fbe --- /dev/null +++ b/src/frame/io/csv_parser.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ml { + +class CSVParser { +public: + static std::vector parse_line(const std::string& line, char delimiter = ',') { + std::vector result; + std::string current; + bool in_quotes = false; + + for (size_t i = 0; i < line.size(); ++i) { + char c = line[i]; + + if (c == 
'"') { + if (in_quotes && i + 1 < line.size() && line[i + 1] == '"') { + current += '"'; + ++i; + } else { + in_quotes = !in_quotes; + } + } else if (c == delimiter && !in_quotes) { + result.push_back(trim(current)); + current.clear(); + } else { + current += c; + } + } + + result.push_back(trim(current)); + return result; + } + + static std::vector> parse(const std::string& filename, char delimiter = ',') { + std::ifstream file(filename); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + filename); + } + return parse(file, delimiter); + } + + static std::vector> parse(std::istream& is, char delimiter = ',') { + std::vector> result; + std::string line; + + while (std::getline(is, line)) { + if (!line.empty() && line.back() == '\r') { + line.pop_back(); + } + if (!line.empty()) { + result.push_back(parse_line(line, delimiter)); + } + } + + return result; + } + + static std::vector> parse_numeric(const std::string& filename, + char delimiter = ',', + size_t skip_rows = 0) { + auto data = parse(filename, delimiter); + std::vector> result; + + size_t start = std::min(skip_rows, data.size()); + for (size_t i = start; i < data.size(); ++i) { + std::vector row; + for (const auto& cell : data[i]) { + try { + row.push_back(std::stod(cell)); + } catch (...) 
{ + row.push_back(std::numeric_limits::quiet_NaN()); + } + } + result.push_back(row); + } + + return result; + } + + static void write(const std::string& filename, + const std::vector& headers, + const std::vector>& data, + char delimiter = ',') { + std::ofstream file(filename); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file for writing: " + filename); + } + + for (size_t i = 0; i < headers.size(); ++i) { + if (i > 0) file << delimiter; + file << headers[i]; + } + file << "\n"; + + for (const auto& row : data) { + for (size_t i = 0; i < row.size(); ++i) { + if (i > 0) file << delimiter; + file << row[i]; + } + file << "\n"; + } + } + + static void write_line(std::ostream& os, + const std::vector& values, + char delimiter = ',') { + for (size_t i = 0; i < values.size(); ++i) { + if (i > 0) os << delimiter; + if (values[i].find(delimiter) != std::string::npos || + values[i].find('"') != std::string::npos) { + os << "\"" << values[i] << "\""; + } else { + os << values[i]; + } + } + } + +private: + static std::string trim(const std::string& s) { + size_t start = 0; + while (start < s.size() && std::isspace(s[start])) ++start; + size_t end = s.size(); + while (end > start && std::isspace(s[end - 1])) --end; + return s.substr(start, end - start); + } +}; + +} // namespace ml diff --git a/src/frame/io/json_parser.h b/src/frame/io/json_parser.h new file mode 100644 index 0000000..2dbd246 --- /dev/null +++ b/src/frame/io/json_parser.h @@ -0,0 +1,266 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ml { + +class CSVParser2 { +public: + static std::vector parse_line(const std::string& line, char delimiter = ',') { + std::vector result; + std::string current; + bool in_quotes = false; + + for (size_t i = 0; i < line.size(); ++i) { + char c = line[i]; + + if (c == '"') { + if (in_quotes && i + 1 < line.size() && line[i + 1] == '"') { + current += '"'; + ++i; + } else { + in_quotes = 
!in_quotes; + } + } else if (c == delimiter && !in_quotes) { + result.push_back(trim(current)); + current.clear(); + } else { + current += c; + } + } + + result.push_back(trim(current)); + return result; + } + + static std::vector> parse(const std::string& filename, char delimiter = ',') { + std::ifstream file(filename); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + filename); + } + return parse(file, delimiter); + } + + static std::vector> parse(std::istream& is, char delimiter = ',') { + std::vector> result; + std::string line; + + while (std::getline(is, line)) { + if (!line.empty() && line.back() == '\r') { + line.pop_back(); + } + if (!line.empty()) { + result.push_back(parse_line(line, delimiter)); + } + } + + return result; + } + + static std::vector> parse_numeric(const std::string& filename, + char delimiter = ',', + size_t skip_rows = 0) { + auto data = parse(filename, delimiter); + std::vector> result; + + size_t start = std::min(skip_rows, data.size()); + for (size_t i = start; i < data.size(); ++i) { + std::vector row; + for (const auto& cell : data[i]) { + try { + row.push_back(std::stod(cell)); + } catch (...) 
{ + row.push_back(std::nan("")); + } + } + result.push_back(row); + } + + return result; + } + + static void write(const std::string& filename, + const std::vector& headers, + const std::vector>& data, + char delimiter = ',') { + std::ofstream file(filename); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file for writing: " + filename); + } + + for (size_t i = 0; i < headers.size(); ++i) { + if (i > 0) file << delimiter; + file << headers[i]; + } + file << "\n"; + + for (const auto& row : data) { + for (size_t i = 0; i < row.size(); ++i) { + if (i > 0) file << delimiter; + file << row[i]; + } + file << "\n"; + } + } + + static void write_line(std::ostream& os, + const std::vector& values, + char delimiter = ',') { + for (size_t i = 0; i < values.size(); ++i) { + if (i > 0) os << delimiter; + if (values[i].find(delimiter) != std::string::npos || + values[i].find('"') != std::string::npos) { + os << "\"" << values[i] << "\""; + } else { + os << values[i]; + } + } + } + +private: + static std::string trim(const std::string& s) { + size_t start = 0; + while (start < s.size() && std::isspace(s[start])) ++start; + size_t end = s.size(); + while (end > start && std::isspace(s[end - 1])) --end; + return s.substr(start, end - start); + } +}; + +class JSONParser { +public: + using DataMap = std::map>>; + + static DataMap parse_numeric(const std::string& filename) { + std::ifstream file(filename); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + filename); + } + + std::string content((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + + return parse_numeric_string(content); + } + + static DataMap parse_numeric(std::istream& is) { + std::string content((std::istreambuf_iterator(is)), + std::istreambuf_iterator()); + return parse_numeric_string(content); + } + +private: + static DataMap parse_numeric_string(const std::string& json) { + DataMap result; + + size_t columns_pos = json.find("\"columns\""); + if (columns_pos 
!= std::string::npos) { + size_t start = json.find("[", columns_pos); + size_t end = json.find("]", start); + if (start != std::string::npos && end != std::string::npos) { + std::string cols_str = json.substr(start + 1, end - start - 1); + std::vector columns = parse_string_array(cols_str); + + size_t data_pos = json.find("\"data\""); + if (data_pos != std::string::npos) { + size_t data_start = json.find("[", data_pos); + size_t data_end = json.rfind("]"); + + for (const auto& col : columns) { + result[col] = {}; + } + + std::string data_str = json.substr(data_start, data_end - data_start); + std::vector rows = parse_array_of_arrays(data_str); + + for (size_t r = 0; r < rows.size(); ++r) { + std::vector values = parse_array(rows[r]); + for (size_t c = 0; c < std::min(values.size(), columns.size()); ++c) { + try { + result[columns[c]].push_back({static_cast(r), std::stod(values[c])}); + } catch (...) { + result[columns[c]].push_back({static_cast(r), std::nan("")}); + } + } + } + } + } + } + + return result; + } + + static std::vector parse_string_array(const std::string& s) { + std::vector result; + std::string current; + bool in_string = false; + + for (size_t i = 0; i < s.size(); ++i) { + char c = s[i]; + if (c == '"' && (i == 0 || s[i-1] != '\\')) { + in_string = !in_string; + } else if (c == ',' && !in_string) { + result.push_back(trim(current)); + current.clear(); + } else { + current += c; + } + } + if (!current.empty()) { + result.push_back(trim(current)); + } + + return result; + } + + static std::vector parse_array(const std::string& s) { + return parse_string_array(s); + } + + static std::vector parse_array_of_arrays(const std::string& s) { + std::vector result; + std::string current; + int depth = 0; + + for (size_t i = 0; i < s.size(); ++i) { + char c = s[i]; + if (c == '[') { + if (depth == 0 && !current.empty()) { + current.clear(); + } + ++depth; + } else if (c == ']') { + --depth; + if (depth == 0) { + result.push_back(current); + current.clear(); + 
continue; + } + } + current += c; + } + + return result; + } + + static std::string trim(const std::string& s) { + size_t start = 0; + while (start < s.size() && std::isspace(s[start])) ++start; + size_t end = s.size(); + while (end > start && std::isspace(s[end - 1])) --end; + std::string result = s.substr(start, end - start); + if (result.front() == '"' && result.back() == '"' && result.size() > 1) { + result = result.substr(1, result.size() - 2); + } + return result; + } +}; + +} // namespace ml diff --git a/src/frame/ml.h b/src/frame/ml.h new file mode 100644 index 0000000..b3c2e41 --- /dev/null +++ b/src/frame/ml.h @@ -0,0 +1,458 @@ +#pragma once + +#include "frame.h" +#include +#include +#include +#include +#include +#include +#include + +namespace ml { +namespace ml_ops { + +template +struct TrainTestSplit { + DataFrame X_train; + DataFrame X_test; + DataFrame y_train; + DataFrame y_test; +}; + +template +TrainTestSplit train_test_split(const DataFrame& X, const DataFrame& y, + double test_size = 0.2, unsigned int seed = 0) { + std::mt19937 gen(seed); + size_t n_samples = X.rows(); + size_t n_test = static_cast(n_samples * test_size); + size_t n_train = n_samples - n_test; + + std::vector indices(n_samples); + std::iota(indices.begin(), indices.end(), 0); + std::shuffle(indices.begin(), indices.end(), gen); + + std::vector train_indices(indices.begin(), indices.begin() + n_train); + std::vector test_indices(indices.begin() + n_train, indices.end()); + + TrainTestSplit result; + result.X_train = X.iloc(train_indices); + result.X_test = X.iloc(test_indices); + result.y_train = y.iloc(train_indices); + result.y_test = y.iloc(test_indices); + + return result; +} + +template +DataFrame min_max_scaler(const DataFrame& data) { + DataFrame result = data; + for (size_t i = 0; i < result.cols(); ++i) { + auto& col = result.col(i); + T min_val = col.min(); + T max_val = col.max(); + if (max_val != min_val) { + for (size_t j = 0; j < col.size(); ++j) { + col[j] = 
(col[j] - min_val) / (max_val - min_val); + } + } + } + return result; +} + +template +DataFrame standard_scaler(const DataFrame& data) { + DataFrame result = data; + for (size_t i = 0; i < result.cols(); ++i) { + auto& col = result.col(i); + T mean_val = col.mean(); + T std_val = col.std(); + if (std_val != 0) { + for (size_t j = 0; j < col.size(); ++j) { + col[j] = (col[j] - mean_val) / std_val; + } + } + } + return result; +} + +template +std::vector one_hot_encode(const std::vector& labels) { + std::vector unique_labels = labels; + std::sort(unique_labels.begin(), unique_labels.end()); + unique_labels.erase(std::unique(unique_labels.begin(), unique_labels.end()), unique_labels.end()); + + std::map label_to_idx; + for (size_t i = 0; i < unique_labels.size(); ++i) { + label_to_idx[unique_labels[i]] = i; + } + + std::vector encoded(labels.size() * unique_labels.size(), T(0)); + for (size_t i = 0; i < labels.size(); ++i) { + size_t idx = label_to_idx[labels[i]]; + encoded[i * unique_labels.size() + idx] = T(1); + } + + return encoded; +} + +template +std::vector label_encode(const std::vector& labels) { + std::vector unique_labels = labels; + std::sort(unique_labels.begin(), unique_labels.end()); + unique_labels.erase(std::unique(unique_labels.begin(), unique_labels.end()), unique_labels.end()); + + std::map label_to_idx; + for (size_t i = 0; i < unique_labels.size(); ++i) { + label_to_idx[unique_labels[i]] = i; + } + + std::vector encoded(labels.size()); + for (size_t i = 0; i < labels.size(); ++i) { + encoded[i] = label_to_idx[labels[i]]; + } + + return encoded; +} + +template +DataFrame polynomial_features(const DataFrame& data, int degree = 2) { + DataFrame result = data; + + for (size_t i = 0; i < data.cols(); ++i) { + for (int d = 2; d <= degree; ++d) { + std::vector new_col; + const auto& col = data.col(i); + for (size_t j = 0; j < col.size(); ++j) { + new_col.push_back(std::pow(col[j], d)); + } + result.add_column(data.columns(i) + "_pow" + 
std::to_string(d), new_col); + } + } + + return result; +} + +template +DataFrame cross_terms(const DataFrame& data) { + DataFrame result = data; + + for (size_t i = 0; i < data.cols(); ++i) { + for (size_t j = i + 1; j < data.cols(); ++j) { + std::vector new_col; + const auto& col1 = data.col(i); + const auto& col2 = data.col(j); + for (size_t k = 0; k < col1.size(); ++k) { + new_col.push_back(col1[k] * col2[k]); + } + result.add_column(data.columns(i) + "_x_" + data.columns(j), new_col); + } + } + + return result; +} + +template +std::vector k_means_clustering(const std::vector& data, size_t k, size_t max_iter = 100, unsigned int seed = 0) { + std::mt19937 gen(seed); + std::uniform_int_distribution dist(0, data.size() - 1); + + std::vector centroids(k); + for (size_t i = 0; i < k; ++i) { + centroids[i] = data[dist(gen)]; + } + + std::vector assignments(data.size()); + + for (size_t iter = 0; iter < max_iter; ++iter) { + for (size_t i = 0; i < data.size(); ++i) { + T min_dist = std::abs(data[i] - centroids[0]); + size_t min_idx = 0; + for (size_t j = 1; j < k; ++j) { + T dist = std::abs(data[i] - centroids[j]); + if (dist < min_dist) { + min_dist = dist; + min_idx = j; + } + } + assignments[i] = min_idx; + } + + std::vector new_centroids(k, T(0)); + std::vector counts(k, 0); + + for (size_t i = 0; i < data.size(); ++i) { + new_centroids[assignments[i]] += data[i]; + counts[assignments[i]]++; + } + + for (size_t i = 0; i < k; ++i) { + if (counts[i] > 0) { + new_centroids[i] /= static_cast(counts[i]); + } + } + + centroids = new_centroids; + } + + return centroids; +} + +template +T linear_regression_predict(const std::vector& X, const std::vector& weights, T bias) { + T prediction = bias; + for (size_t i = 0; i < X.size(); ++i) { + prediction += X[i] * weights[i]; + } + return prediction; +} + +template +std::pair, T> linear_regression_fit(const DataFrame& X, const Series& y, + double learning_rate = 0.01, + size_t n_iterations = 1000) { + size_t n_features = 
X.cols(); + size_t n_samples = X.rows(); + + std::vector weights(n_features, T(0)); + T bias = T(0); + + for (size_t iter = 0; iter < n_iterations; ++iter) { + std::vector dw(n_features, T(0)); + T db = T(0); + + for (size_t i = 0; i < n_samples; ++i) { + T prediction = bias; + for (size_t j = 0; j < n_features; ++j) { + prediction += X.at(i, j) * weights[j]; + } + T error = prediction - y[i]; + + db += error / static_cast(n_samples); + for (size_t j = 0; j < n_features; ++j) { + dw[j] += error * X.at(i, j) / static_cast(n_samples); + } + } + + bias -= learning_rate * db; + for (size_t j = 0; j < n_features; ++j) { + weights[j] -= learning_rate * dw[j]; + } + } + + return {weights, bias}; +} + +template +T linear_regression_predict_row(const DataFrame& X, size_t row_idx, + const std::vector& weights, T bias); + +template +DataFrame linear_regression_predict(const DataFrame& X, const std::vector& weights, T bias) { + DataFrame predictions; + std::vector pred_values; + + for (size_t i = 0; i < X.rows(); ++i) { + T pred = linear_regression_predict_row(X, i, weights, bias); + pred_values.push_back(pred); + } + + predictions.add_column("prediction", pred_values); + return predictions; +} + +template +T linear_regression_predict_row(const DataFrame& X, size_t row_idx, + const std::vector& weights, T bias) { + T prediction = bias; + for (size_t j = 0; j < X.cols(); ++j) { + prediction += X.at(row_idx, j) * weights[j]; + } + return prediction; +} + +template +T logistic_regression_predict(const std::vector& X, const std::vector& weights, T bias) { + T z = bias; + for (size_t i = 0; i < X.size(); ++i) { + z += X[i] * weights[i]; + } + return T(1) / (T(1) + std::exp(-z)); +} + +template +DataFrame train_test_split_X_y(const DataFrame& X, const Series& y, + double test_size = 0.2, unsigned int seed = 0) { + auto split = train_test_split(X, DataFrame(), test_size, seed); + DataFrame result; + result = split.X_train; + return result; +} + +template +DataFrame k_fold_split(const 
DataFrame& data, size_t k, size_t fold) { + if (fold >= k) throw std::invalid_argument("Fold index out of range"); + size_t n_samples = data.rows(); + size_t fold_size = n_samples / k; + + size_t start = fold * fold_size; + size_t end = (fold == k - 1) ? n_samples : start + fold_size; + + std::vector test_indices(end - start); + std::iota(test_indices.begin(), test_indices.end(), start); + + std::vector train_indices; + for (size_t i = 0; i < n_samples; ++i) { + if (i < start || i >= end) { + train_indices.push_back(i); + } + } + + DataFrame train_result = data.iloc(train_indices); + return train_result; +} + +template +T mean_squared_error(const Series& y_true, const Series& y_pred) { + if (y_true.size() != y_pred.size()) { + throw std::invalid_argument("Size mismatch"); + } + T sum_sq = T(0); + for (size_t i = 0; i < y_true.size(); ++i) { + T diff = y_true[i] - y_pred[i]; + sum_sq += diff * diff; + } + return sum_sq / static_cast(y_true.size()); +} + +template +T root_mean_squared_error(const Series& y_true, const Series& y_pred) { + return std::sqrt(mean_squared_error(y_true, y_pred)); +} + +template +T mean_absolute_error(const Series& y_true, const Series& y_pred) { + if (y_true.size() != y_pred.size()) { + throw std::invalid_argument("Size mismatch"); + } + T sum_abs = T(0); + for (size_t i = 0; i < y_true.size(); ++i) { + sum_abs += std::abs(y_true[i] - y_pred[i]); + } + return sum_abs / static_cast(y_true.size()); +} + +template +T r2_score(const Series& y_true, const Series& y_pred) { + if (y_true.size() != y_pred.size()) { + throw std::invalid_argument("Size mismatch"); + } + T mean_true = y_true.mean(); + T ss_tot = T(0); + T ss_res = T(0); + for (size_t i = 0; i < y_true.size(); ++i) { + ss_tot += (y_true[i] - mean_true) * (y_true[i] - mean_true); + ss_res += (y_true[i] - y_pred[i]) * (y_true[i] - y_pred[i]); + } + return T(1) - (ss_res / ss_tot); +} + +template +T accuracy_score(const Series& y_true, const Series& y_pred) { + if (y_true.size() != 
y_pred.size()) { + throw std::invalid_argument("Size mismatch"); + } + size_t correct = 0; + for (size_t i = 0; i < y_true.size(); ++i) { + if (y_true[i] == y_pred[i]) correct++; + } + return static_cast(correct) / static_cast(y_true.size()); +} + +template +std::map classification_report(const Series& y_true, const Series& y_pred) { + std::map report; + report["accuracy"] = accuracy_score(y_true, y_pred); + report["precision"] = accuracy_score(y_true, y_pred); + report["recall"] = accuracy_score(y_true, y_pred); + report["f1"] = accuracy_score(y_true, y_pred); + return report; +} + +template +DataFrame decision_tree_predict(const DataFrame& X, + const std::map& tree, + const std::string& target_column) { + DataFrame predictions; + std::vector pred_values; + + for (size_t i = 0; i < X.rows(); ++i) { + T prediction = tree.at(target_column); + pred_values.push_back(prediction); + } + + predictions.add_column("prediction", pred_values); + return predictions; +} + +template +DataFrame random_forest_predict(const DataFrame& X, + const std::vector>& trees, + const std::string& target_column) { + DataFrame predictions; + std::vector pred_values; + + for (size_t i = 0; i < X.rows(); ++i) { + T sum = T(0); + for (const auto& tree : trees) { + sum += tree.at(target_column); + } + pred_values.push_back(sum / static_cast(trees.size())); + } + + predictions.add_column("prediction", pred_values); + return predictions; +} + +template +DataFrame gradient_boosting_predict(const DataFrame& X, + const std::vector>>& models, + T initial_prediction) { + DataFrame predictions; + std::vector pred_values(X.rows(), initial_prediction); + + for (const auto& [learning_rate, weights] : models) { + for (size_t i = 0; i < X.rows(); ++i) { + T pred = T(0); + for (size_t j = 0; j < weights.size(); ++j) { + pred += X.at(i, j) * weights[j]; + } + pred_values[i] += learning_rate * pred; + } + } + + predictions.add_column("prediction", pred_values); + return predictions; +} + +template +DataFrame 
normalize_inplace(DataFrame& data) { + return min_max_scaler(data); +} + +template +DataFrame standardize_inplace(DataFrame& data) { + return standard_scaler(data); +} + +template +DataFrame add_bias_term(const DataFrame& data) { + DataFrame result = data; + std::vector ones(data.rows(), T(1)); + result.add_column("bias", ones); + return result; +} + +} // namespace ml_ops + +} // namespace ml diff --git a/src/frame/series.h b/src/frame/series.h new file mode 100644 index 0000000..ebcc3e7 --- /dev/null +++ b/src/frame/series.h @@ -0,0 +1,684 @@ +#pragma once + +#include "../tensor/tensor.h" +#include "../tensor/cpu_engine.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ml { + +template +class Series { +public: + using ValueType = T; + + Series() : name_(""), engine_(std::make_shared>()) {} + + explicit Series(const std::string& name) : name_(name), engine_(std::make_shared>()) {} + + Series(const std::string& name, const std::vector& data) + : name_(name), engine_(std::make_shared>()), data_(data) {} + + Series(const std::string& name, std::shared_ptr> engine) + : name_(name), engine_(engine) {} + + const std::string& name() const { return name_; } + void set_name(const std::string& name) { name_ = name; } + + size_t size() const { return data_.size(); } + bool empty() const { return data_.empty(); } + + const T& operator[](size_t i) const { return data_[i]; } + + template + typename std::enable_if::value, U&>::type + operator[](size_t i) { return data_[i]; } + + T& at(size_t i) { + if (i >= size()) throw std::out_of_range("Series index out of range"); + return data_[i]; + } + const T& at(size_t i) const { + if (i >= size()) throw std::out_of_range("Series index out of range"); + return data_[i]; + } + + std::shared_ptr> engine() { return engine_; } + + void push_back(const T& value) { data_.push_back(value); } + void push_back(T&& value) { data_.push_back(std::move(value)); } + 
void reserve(size_t n) { data_.reserve(n); } // forwards to data_.reserve
+    void resize(size_t n) { data_.resize(n); } // forwards to data_.resize(n)
+    void resize(size_t n, const T& value) { data_.resize(n, value); } // forwards to data_.resize(n, value)
+
+    T sum() const { // sum of all elements, accumulated in T starting from T(0)
+        if (data_.empty()) return T(0); // explicit empty case
+        return std::accumulate(data_.begin(), data_.end(), T(0));
+    }
+
+    T mean() const { return sum() / static_cast(size()); } // NOTE(review): no empty guard here, so an empty series divides by a zero size - confirm callers never do this
+
+    T var() const { // sample variance: squared deviations from mean() divided by size() - 1
+        if (size() < 2) return T(0); // fewer than two elements yields T(0)
+        T m = mean();
+        T sum_sq = T(0);
+        for (const auto& val : data_) {
+            T diff = val - m;
+            sum_sq += diff * diff;
+        }
+        return sum_sq / static_cast(size() - 1);
+    }
+
+    T std() const { return std::sqrt(var()); } // square root of var()
+    T min() const { return *std::min_element(data_.begin(), data_.end()); } // NOTE(review): dereferences the returned iterator without an empty check
+    T max() const { return *std::max_element(data_.begin(), data_.end()); } // NOTE(review): same unchecked dereference as min()
+
+    T median() const { // middle value of a sorted copy; data_ itself is not reordered
+        if (empty()) return T(0); // empty series yields T(0)
+        std::vector sorted = data_;
+        std::sort(sorted.begin(), sorted.end());
+        size_t n = sorted.size();
+        if (n % 2 == 0) {
+            return (sorted[n/2 - 1] + sorted[n/2]) / T(2); // even count: the two middle values are added and divided by T(2)
+        }
+        return sorted[n/2];
+    }
+
+    T quantile(T q) const { // value at fraction q of a sorted copy, linearly interpolated between neighbours
+        if (empty()) return T(0); // checked before q is validated, so an empty series never throws
+        if (q < 0 || q > 1) throw std::invalid_argument("Quantile must be between 0 and 1");
+        std::vector sorted = data_;
+        std::sort(sorted.begin(), sorted.end());
+        double pos = (sorted.size() - 1) * q; // fractional position in the sorted copy
+        size_t idx = static_cast(pos); // NOTE(review): the cast target is missing in this copy of the patch; presumably truncation to size_t
+        T frac = static_cast(pos - idx); // distance past idx, used as the interpolation weight
+        if (idx + 1 < sorted.size()) {
+            return sorted[idx] * (T(1) - frac) + sorted[idx + 1] * frac;
+        }
+        return sorted[idx]; // idx is the last element, nothing to interpolate with
+    }
+
+    T dot(const Series& other) const { // sum of element-wise products
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in dot");
+        T result = T(0);
+        for (size_t i = 0; i < size(); ++i) {
+            result += data_[i] * other.data_[i];
+        }
+        return result;
+    }
+
+    size_t idxmin() const { // offset of the iterator returned by std::min_element from begin()
+        return std::min_element(data_.begin(), data_.end()) - data_.begin();
+    }
+
+    size_t idxmax() const { // offset of the iterator returned by std::max_element from begin()
+        return std::max_element(data_.begin(), data_.end()) - data_.begin();
+    }
+
+    Series abs() const { // element-wise std::abs into a new series carrying the same name
+        Series result(name_);
+        result.data_.resize(size());
+        for
(size_t i = 0; i < size(); ++i) { // continuation of the for loop in abs()
+            result.data_[i] = std::abs(data_[i]);
+        }
+        return result;
+    }
+
+    Series sqrt() const { // element-wise std::sqrt into a new series with the same name
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = std::sqrt(data_[i]);
+        }
+        return result;
+    }
+
+    Series exp() const { // element-wise std::exp
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = std::exp(data_[i]);
+        }
+        return result;
+    }
+
+    Series log() const { // element-wise std::log; inputs are passed through unchecked
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = std::log(data_[i]);
+        }
+        return result;
+    }
+
+    Series neg() const { // element-wise negation; same loop body as the unary minus operator of this class
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = -data_[i];
+        }
+        return result;
+    }
+
+    void fill(const T& value) { // overwrites every existing element with value; the size is unchanged
+        std::fill(data_.begin(), data_.end(), value);
+    }
+
+    void fillna(const T& value) { // replaces in place every element for which std::isnan is true
+        for (size_t i = 0; i < data_.size(); ++i) {
+            if (std::isnan(static_cast(data_[i]))) { // NOTE(review): the cast target is missing in this copy of the patch
+                data_[i] = value;
+            }
+        }
+    }
+
+    Series head(size_t n = 5) const { // copy of the first n elements
+        n = std::min(n, size()); // clamp so the iterator arithmetic stays inside data_
+        return Series(name_, std::vector(data_.begin(), data_.begin() + n));
+    }
+
+    Series tail(size_t n = 5) const { // copy of the last n elements
+        n = std::min(n, size()); // clamp so the iterator arithmetic stays inside data_
+        return Series(name_, std::vector(data_.end() - n, data_.end()));
+    }
+
+    Series diff(int periods = 1) const { // data_[i] - data_[i - periods] for i at or beyond periods
+        if (size() <= static_cast(periods)) { // NOTE(review): this early exit returns an empty series, while the normal path returns size() elements
+            return Series(name_);
+        }
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < periods; ++i) { // NOTE(review): compares size_t with the int parameter; a negative periods is not rejected anywhere in this function
+            result.data_[i] = T(0); // leading entries without a predecessor are set to T(0), not NaN
+        }
+        for (size_t i = periods; i < size(); ++i) {
+            result.data_[i] = data_[i] - data_[i - periods];
+        }
+        return result;
+    }
+
+    Series cumsum() const { // running sum; entry i is the sum of elements 0 through i
+        Series result(name_);
+        if (empty()) return result; // guards the data_[0] access below
+        result.data_.resize(size());
+        result.data_[0] = data_[0];
+        for (size_t i = 1; i < size(); ++i) {
+            result.data_[i] = result.data_[i-1] + data_[i];
+        }
+        return result;
+    }
+
+    Series cummax() const { // running maximum; the body continues on the next physical line
+        Series
result(name_); // cummax(): result series carrying the same name
+        if (empty()) return result; // guards the data_[0] access below
+        result.data_.resize(size());
+        result.data_[0] = data_[0];
+        for (size_t i = 1; i < size(); ++i) {
+            result.data_[i] = std::max(result.data_[i-1], data_[i]); // larger of the previous running maximum and the current element
+        }
+        return result;
+    }
+
+    Series cummin() const { // running minimum; mirrors cummax with std::min
+        Series result(name_);
+        if (empty()) return result; // guards the data_[0] access below
+        result.data_.resize(size());
+        result.data_[0] = data_[0];
+        for (size_t i = 1; i < size(); ++i) {
+            result.data_[i] = std::min(result.data_[i-1], data_[i]);
+        }
+        return result;
+    }
+
+    Series rolling(size_t window, bool center = false) const { // moving average over window consecutive elements
+        Series result(name_);
+        if (size() < window) return result; // fewer elements than the window: an empty series is returned
+        result.data_.resize(size());
+        for (size_t i = 0; i < window - 1; ++i) { // NOTE(review): window is unsigned, so window == 0 makes window - 1 wrap around - confirm callers never pass 0
+            result.data_[i] = std::numeric_limits::quiet_NaN(); // the first window - 1 entries have no full window and are set to NaN
+        }
+        for (size_t i = window - 1; i < size(); ++i) {
+            size_t start = center ? i - window / 2 : i - window + 1; // NOTE(review): with center set the window starts at i - window / 2 and still reads window elements, which looks like it can run past the end of data_ for the last rows (for example window 3 at i == size() - 1) - confirm
+            T sum_val = T(0);
+            for (size_t j = 0; j < window; ++j) {
+                sum_val += data_[start + j];
+            }
+            result.data_[i] = sum_val / static_cast(window);
+        }
+        return result;
+    }
+
+    Series ewm(double alpha = 0.3) const { // exponentially weighted average seeded with the first element
+        Series result(name_);
+        if (empty()) return result; // guards the data_[0] access below
+        result.data_.resize(size());
+        result.data_[0] = data_[0];
+        for (size_t i = 1; i < size(); ++i) {
+            result.data_[i] = alpha * data_[i] + (1 - alpha) * result.data_[i-1]; // alpha is a double, so the blend is computed with double operands before being stored
+        }
+        return result;
+    }
+
+    Series operator+(const Series& other) const { // element-wise sum; the result keeps the name of the left operand
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in +");
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] + other.data_[i];
+        }
+        return result;
+    }
+
+    Series operator-(const Series& other) const { // element-wise difference; throws on a size mismatch
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in -");
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] - other.data_[i];
+        }
+        return result;
+    }
+
+    Series operator*(const Series& other) const { // element-wise product; the body continues on the next physical line
+        if
(size() != other.size()) // size check of the element-wise product operator
+            throw std::invalid_argument("Series size mismatch in *");
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] * other.data_[i];
+        }
+        return result;
+    }
+
+    Series operator/(const Series& other) const { // element-wise quotient; throws on a size mismatch
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in /");
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] / other.data_[i]; // no check for zero divisors; the outcome follows the division rules of T
+        }
+        return result;
+    }
+
+    Series operator+(T scalar) const { // adds scalar to every element, returning a new series
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] + scalar;
+        }
+        return result;
+    }
+
+    Series operator-(T scalar) const { // subtracts scalar from every element
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] - scalar;
+        }
+        return result;
+    }
+
+    Series operator*(T scalar) const { // multiplies every element by scalar
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] * scalar;
+        }
+        return result;
+    }
+
+    Series operator/(T scalar) const { // divides every element by scalar; a zero scalar is not rejected
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = data_[i] / scalar;
+        }
+        return result;
+    }
+
+    friend Series operator+(T scalar, const Series& s) { return s + scalar; } // scalar on the left; reuses the member operator by swapping operands
+    friend Series operator*(T scalar, const Series& s) { return s * scalar; } // only the two commutative operators get a scalar-on-the-left form here
+
+    Series& operator+=(const Series& other) { // in-place element-wise add; throws before touching data_ on a size mismatch
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in +=");
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] += other.data_[i];
+        }
+        return *this;
+    }
+
+    Series& operator-=(const Series& other) { // in-place element-wise subtract
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in -=");
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] -= other.data_[i];
+        }
+        return *this;
+    }
+
+    Series& operator*=(const
Series& other) { // in-place element-wise multiply by another series
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in *=");
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] *= other.data_[i];
+        }
+        return *this;
+    }
+
+    Series& operator/=(const Series& other) { // in-place element-wise divide; zero divisors are not checked
+        if (size() != other.size())
+            throw std::invalid_argument("Series size mismatch in /=");
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] /= other.data_[i];
+        }
+        return *this;
+    }
+
+    Series& operator+=(T scalar) { // adds scalar to every element in place
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] += scalar;
+        }
+        return *this;
+    }
+
+    Series& operator-=(T scalar) { // subtracts scalar from every element in place
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] -= scalar;
+        }
+        return *this;
+    }
+
+    Series& operator*=(T scalar) { // multiplies every element by scalar in place
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] *= scalar;
+        }
+        return *this;
+    }
+
+    Series& operator/=(T scalar) { // divides every element by scalar in place; a zero scalar is not rejected
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] /= scalar;
+        }
+        return *this;
+    }
+
+    Series& operator++() { // prefix increment: adds T(1) to every element
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] += T(1);
+        }
+        return *this;
+    }
+
+    Series operator++(int) { // postfix increment: returns a copy taken before the elements are incremented
+        Series tmp(*this);
+        ++(*this);
+        return tmp;
+    }
+
+    Series& operator--() { // prefix decrement: subtracts T(1) from every element
+        for (size_t i = 0; i < size(); ++i) {
+            data_[i] -= T(1);
+        }
+        return *this;
+    }
+
+    Series operator--(int) { // postfix decrement: returns a copy taken before the elements are decremented
+        Series tmp(*this);
+        --(*this);
+        return tmp;
+    }
+
+    Series operator-() const { // unary minus: new series with every element negated
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = -data_[i];
+        }
+        return result;
+    }
+
+    Series operator+() const { return *this; } // unary plus: plain copy
+
+    Series sort(bool ascending = true) const { // returns a sorted copy; this series is left in its original order
+        std::vector sorted = data_;
+        if (ascending) {
+            std::sort(sorted.begin(), sorted.end());
+        } else {
+            std::sort(sorted.begin(), sorted.end(), std::greater{}); // descending order via std::greater
+        }
+        return Series(name_, sorted);
+    }
+
+    Series rank() const { // 1-based rank of each element in ascending order; the body continues on the next physical line
+        std::vector indices(size());
+        std::iota(indices.begin(), indices.end(), 0); // indices 0, 1, 2, ... to be permuted by the sort below
+        std::stable_sort(indices.begin(), indices.end(),
+                  [this](size_t a, size_t b) { return data_[a] < data_[b]; }); +
Series result(name_); // rank(): result series; one rank is written per original position below
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[indices[i]] = static_cast(i + 1); // the element at sorted position i gets rank i + 1, so equal values receive distinct consecutive ranks rather than a shared one
+        }
+        return result;
+    }
+
+    size_t count() const { // number of stored elements, NaN entries included
+        return data_.size();
+    }
+
+    T prod() const { // product of all elements
+        if (empty()) return T(1); // empty product is T(1)
+        T result = T(1);
+        for (const auto& val : data_) {
+            result *= val;
+        }
+        return result;
+    }
+
+    Series clip(T lower = std::numeric_limits::lowest(), // defaults leave a bound effectively open
+                T upper = std::numeric_limits::max()) const {
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = std::max(lower, std::min(upper, data_[i])); // cap at upper first, then raise to lower; lower wins if the bounds are crossed
+        }
+        return result;
+    }
+
+    std::vector to_vector() const { return data_; } // returns a copy of the element storage
+
+    void print(std::ostream& os = std::cout) const { // writes the name, a colon and newline, then the elements as a bracketed comma-separated list
+        os << name_ << ":\n[";
+        for (size_t i = 0; i < data_.size(); ++i) {
+            if (i > 0) os << ", "; // separator goes before every element except the first
+            os << data_[i];
+        }
+        os << "]\n";
+    }
+
+    Series isnull() const { // one entry per element: the std::isnan result for that element
+        Series result(name_); // NOTE(review): the template argument of the result type is missing in this copy of the patch; presumably a boolean series
+        for (const auto& val : data_) {
+            result.push_back(std::isnan(static_cast(val)));
+        }
+        return result;
+    }
+
+    Series notnull() const { // logical inverse of isnull
+        Series result(name_);
+        for (const auto& val : data_) {
+            result.push_back(!std::isnan(static_cast(val)));
+        }
+        return result;
+    }
+
+    Series filter(const Series& mask) const { // keeps the elements whose mask entry converts to true
+        if (size() != mask.size())
+            throw std::invalid_argument("Series and mask size mismatch");
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) {
+            if (mask[i]) {
+                result.push_back(data_[i]);
+            }
+        }
+        return result;
+    }
+
+    Series dropna() const { // copy without the elements for which std::isnan is true
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) {
+            if (!std::isnan(static_cast(data_[i]))) {
+                result.push_back(data_[i]);
+            }
+        }
+        return result;
+    }
+
+    template
+    Series apply(F&& func) const { // calls func on each element and stores the return value at the same index
+        Series result(name_);
+        result.data_.resize(size());
+        for (size_t i = 0; i < size(); ++i) {
+            result.data_[i] = func(data_[i]);
+        }
+        return result;
+    }
+
+    template
+    Series astype() const { // element-wise static_cast into a new series; the target type parameter is missing in this copy of the patch
+        Series result(name_);
+        for (const auto& val : data_) { +
result.push_back(static_cast(val)); // astype(): appends the converted element
+        }
+        return result;
+    }
+
+    Series operator<(const Series& other) const { // element-wise less-than; returns a series of results, not a single bool
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) { // NOTE(review): unlike the arithmetic operators there is no size check; other.data_[i] is read for every i below size()
+            result.push_back(data_[i] < other.data_[i]);
+        }
+        return result;
+    }
+
+    Series operator>(const Series& other) const { // element-wise greater-than; NOTE(review): no size check
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) {
+            result.push_back(data_[i] > other.data_[i]);
+        }
+        return result;
+    }
+
+    Series operator<=(const Series& other) const { // element-wise less-or-equal; NOTE(review): no size check
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) {
+            result.push_back(data_[i] <= other.data_[i]);
+        }
+        return result;
+    }
+
+    Series operator>=(const Series& other) const { // element-wise greater-or-equal; NOTE(review): no size check
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) {
+            result.push_back(data_[i] >= other.data_[i]);
+        }
+        return result;
+    }
+
+    Series operator==(const Series& other) const { // element-wise equality, so the result is a series and not a whole-object comparison; NOTE(review): no size check
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) {
+            result.push_back(data_[i] == other.data_[i]);
+        }
+        return result;
+    }
+
+    Series operator!=(const Series& other) const { // element-wise inequality; NOTE(review): no size check
+        Series result(name_);
+        for (size_t i = 0; i < size(); ++i) {
+            result.push_back(data_[i] != other.data_[i]);
+        }
+        return result;
+    }
+
+    Series operator<(T scalar) const { // compares every element against scalar; one result entry per element
+        Series result(name_);
+        for (const auto& val : data_) {
+            result.push_back(val < scalar);
+        }
+        return result;
+    }
+
+    Series operator>(T scalar) const { // element greater than scalar
+        Series result(name_);
+        for (const auto& val : data_) {
+            result.push_back(val > scalar);
+        }
+        return result;
+    }
+
+    Series operator<=(T scalar) const { // element less than or equal to scalar
+        Series result(name_);
+        for (const auto& val : data_) {
+            result.push_back(val <= scalar);
+        }
+        return result;
+    }
+
+    Series operator>=(T scalar) const { // element greater than or equal to scalar
+        Series result(name_);
+        for (const auto& val : data_) {
+            result.push_back(val >= scalar);
+        }
+        return result;
+    }
+
+    Series operator==(T scalar) const { // element equal to scalar, using operator== of T
+        Series result(name_);
+        for (const auto& val : data_) {
+            result.push_back(val == scalar);
+        }
+        return result;
+    }
+
+    Series operator!=(T
scalar) const { // element not equal to scalar; one result entry per element
+        Series result(name_);
+        for (const auto& val : data_) {
+            result.push_back(val != scalar);
+        }
+        return result;
+    }
+
+private:
+    std::string name_; // label passed on to derived series and written by print()
+    std::vector data_; // element storage
+    std::shared_ptr> engine_; // set by the constructors and returned by engine(); no other member of this class reads it
+};
+
+template
+std::ostream& operator<<(std::ostream& os, const Series& s) { // stream insertion delegates to Series::print
+    s.print(os);
+    return os;
+}
+
+using SeriesDouble = Series; // NOTE(review): the template arguments of these four aliases are missing in this copy of the patch; the names suggest double, float, int and bool
+using SeriesFloat = Series;
+using SeriesInt = Series;
+using SeriesBool = Series;
+
+} // namespace ml diff --git a/src/frame/stats.h b/src/frame/stats.h new file mode 100644 index 0000000..9596dd5 --- /dev/null +++ b/src/frame/stats.h @@ -0,0 +1,275 @@ +#pragma once + +#include "frame.h" +#include +#include +#include +#include +#include +#include +#include + +namespace ml { + +namespace stats { + +template +T mean(const std::vector& data) { + if (data.empty()) return T(0); + return std::accumulate(data.begin(), data.end(), T(0)) / static_cast(data.size()); +} + +template +T variance(const std::vector& data) { + if (data.size() < 2) return T(0); + T m = mean(data); + T sum_sq = T(0); + for (const auto& val : data) { + T diff = val - m; + sum_sq += diff * diff; + } + return sum_sq / static_cast(data.size() - 1); +} + +template +T std_dev(const std::vector& data) { + return std::sqrt(variance(data)); +} + +template +T median(std::vector data) { + if (data.empty()) return T(0); + std::sort(data.begin(), data.end()); + size_t n = data.size(); + if (n % 2 == 0) { + return (data[n/2 - 1] + data[n/2]) / T(2); + } + return data[n/2]; +} + +template +T percentile(std::vector data, T p) { + if (data.empty()) return T(0); + if (p < 0 || p > 1) throw std::invalid_argument("Percentile must be between 0 and 1"); + std::sort(data.begin(), data.end()); + double pos = (data.size() - 1) * p; + size_t idx = static_cast(pos); + T frac = static_cast(pos - idx); + if (idx + 1 < data.size()) { + return data[idx] * (T(1) - frac) + data[idx + 1] * frac; + } + return data[idx]; +} + +template +T covariance(const std::vector& x, const
std::vector& y) { // covariance(): sample covariance of x and y, divided by x.size() - 1
+    if (x.size() != y.size() || x.size() < 2) return T(0); // NOTE(review): a size mismatch is reported as T(0) rather than as an error, so it is indistinguishable from zero covariance
+    T mean_x = mean(x);
+    T mean_y = mean(y);
+    T sum = T(0);
+    for (size_t i = 0; i < x.size(); ++i) {
+        sum += (x[i] - mean_x) * (y[i] - mean_y);
+    }
+    return sum / static_cast(x.size() - 1);
+}
+
+template
+T correlation(const std::vector& x, const std::vector& y) { // covariance divided by the product of the two std_dev values
+    T cov = covariance(x, y);
+    T std_x = std_dev(x);
+    T std_y = std_dev(y);
+    if (std_x == 0 || std_y == 0) return T(0); // zero spread in either input yields T(0) instead of a division by zero
+    return cov / (std_x * std_y);
+}
+
+template
+T skewness(const std::vector& data) { // sum of cubed standardized deviations scaled by n / ((n - 1) * (n - 2))
+    if (data.size() < 3) return T(0); // the scaling factor needs at least three values
+    T m = mean(data);
+    T s = std_dev(data);
+    if (s == 0) return T(0); // constant data: avoids dividing by a zero deviation
+    T sum_cubed = T(0);
+    for (const auto& val : data) {
+        T diff = (val - m) / s; // standardized deviation of this value
+        sum_cubed += diff * diff * diff;
+    }
+    T n = static_cast(data.size());
+    return (n / ((n - 1) * (n - 2))) * sum_cubed;
+}
+
+template
+T kurtosis(const std::vector& data) { // scaled sum of fourth powers of standardized deviations minus a correction term
+    if (data.size() < 4) return T(0); // the factors below divide by n - 3
+    T m = mean(data);
+    T s = std_dev(data);
+    if (s == 0) return T(0); // constant data: avoids dividing by a zero deviation
+    T sum_quartile = T(0); // accumulates fourth powers, despite the name
+    for (const auto& val : data) {
+        T diff = (val - m) / s;
+        sum_quartile += diff * diff * diff * diff;
+    }
+    T n = static_cast(data.size());
+    T coef = n * (n + 1) / ((n - 1) * (n - 2) * (n - 3));
+    T subtract = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3)); // NOTE(review): presumably the term that makes this an excess kurtosis - confirm against the intended definition
+    return coef * sum_quartile - subtract;
+}
+
+template
+std::map describe(const std::vector& data) { // summary statistics keyed by the string labels assigned below
+    std::map result;
+    result["count"] = static_cast(data.size());
+    result["mean"] = mean(data);
+    result["std"] = std_dev(data);
+    result["min"] = *std::min_element(data.begin(), data.end()); // NOTE(review): unchecked dereference; empty data is not guarded in this function
+    result["max"] = *std::max_element(data.begin(), data.end()); // NOTE(review): same unchecked dereference
+    result["25%"] = percentile(data, T(0.25)); // percentile takes a fraction between 0 and 1
+    result["50%"] = percentile(data, T(0.50));
+    result["75%"] = percentile(data, T(0.75));
+    return result;
+}
+
+template
+std::vector normalize(const std::vector& data) { // min-max scaling; the body continues on the next physical line
+    if (data.empty()) return {}; // guards the two dereferences that follow
+    T min_val = *std::min_element(data.begin(), data.end());
+    T max_val =
*std::max_element(data.begin(), data.end()); // normalize(): largest value, used with min_val as the scaling range
+    if (max_val == min_val) return std::vector(data.size(), T(0)); // constant data would divide by zero, so all outputs are T(0)
+    std::vector result(data.size());
+    for (size_t i = 0; i < data.size(); ++i) {
+        result[i] = (data[i] - min_val) / (max_val - min_val); // maps min_val to 0 and max_val to 1
+    }
+    return result;
+}
+
+template
+std::vector standardize(const std::vector& data) { // subtracts mean(data) and divides by std_dev(data)
+    T m = mean(data);
+    T s = std_dev(data);
+    if (s == 0) return std::vector(data.size(), T(0)); // zero deviation: all outputs are T(0)
+    std::vector result(data.size());
+    for (size_t i = 0; i < data.size(); ++i) {
+        result[i] = (data[i] - m) / s;
+    }
+    return result;
+}
+
+template
+std::vector log_transform(const std::vector& data) { // element-wise log of value plus one
+    std::vector result(data.size());
+    for (size_t i = 0; i < data.size(); ++i) {
+        result[i] = std::log(data[i] + T(1)); // inputs are not range-checked before the log
+    }
+    return result;
+}
+
+template
+std::vector sqrt_transform(const std::vector& data) { // element-wise square root of the absolute value
+    std::vector result(data.size());
+    for (size_t i = 0; i < data.size(); ++i) {
+        result[i] = std::sqrt(std::abs(data[i])); // std::abs discards the sign, so negative inputs map to the same output as their positive counterpart
+    }
+    return result;
+}
+
+template
+std::vector box_cox_transform(const std::vector& data, T lambda) { // power transform selected by lambda
+    std::vector result(data.size());
+    for (size_t i = 0; i < data.size(); ++i) {
+        if (lambda == 0) { // exact comparison; the branch is re-evaluated for every element
+            result[i] = std::log(data[i]);
+        } else {
+            result[i] = (std::pow(data[i], lambda) - 1) / lambda;
+        }
+    }
+    return result;
+}
+
+template
+T z_score(const T& value, const std::vector& data) { // distance of value from mean(data) in units of std_dev(data)
+    T m = mean(data);
+    T s = std_dev(data);
+    if (s == 0) return T(0); // zero deviation yields T(0)
+    return (value - m) / s;
+}
+
+template
+std::vector argsort(const std::vector& data, bool ascending = true) { // indices that would order data; data itself is not modified
+    std::vector indices(data.size());
+    std::iota(indices.begin(), indices.end(), 0);
+    if (ascending) {
+        std::stable_sort(indices.begin(), indices.end(), // stable sort keeps equal values in their original relative order
+                  [&data](size_t a, size_t b) { return data[a] < data[b]; });
+    } else {
+        std::stable_sort(indices.begin(), indices.end(),
+                  [&data](size_t a, size_t b) { return data[a] > data[b]; });
+    }
+    return indices;
+}
+
+template
+std::vector bootstrap(const std::vector& data, size_t n_iterations,
unsigned int seed = 0) { // bootstrap(): returns n_iterations means of resamples drawn with replacement
+    std::mt19937 gen(seed); // fixed default seed, so repeated calls with the same arguments draw the same samples
+    std::uniform_int_distribution dist(0, data.size() - 1); // NOTE(review): data.size() - 1 wraps around when data is empty; there is no guard in this function
+    std::vector means;
+    for (size_t i = 0; i < n_iterations; ++i) {
+        std::vector sample;
+        for (size_t j = 0; j < data.size(); ++j) {
+            sample.push_back(data[dist(gen)]); // each resample has the same length as data
+        }
+        means.push_back(mean(sample));
+    }
+    return means;
+}
+
+template
+T confidence_interval(const std::vector& data, T confidence = 0.95) { // returns critical * se as computed below
+    T m = mean(data); // NOTE(review): computed but never used in this function
+    T se = std_dev(data) / std::sqrt(static_cast(data.size())); // standard error: deviation over the square root of the count
+    T critical = percentile(bootstrap(data, 1000), T(1 - (1 - confidence) / 2)); // NOTE(review): this is an upper quantile of the bootstrap means, so it is on the scale of the data and not a unitless critical value - confirm the intended formula
+    return critical * se;
+}
+
+template
+std::vector> covariance_matrix(const std::vector>& data) { // square matrix of pairwise covariance between the inner vectors
+    size_t n = data.size(); // one row and one column per inner vector
+    std::vector> result(n, std::vector(n));
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            result[i][j] = covariance(data[i], data[j]); // every pair is computed, including both orderings of the same pair
+        }
+    }
+    return result;
+}
+
+template
+std::vector> correlation_matrix(const std::vector>& data) { // square matrix of pairwise correlation between the inner vectors
+    size_t n = data.size();
+    std::vector> result(n, std::vector(n));
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            result[i][j] = correlation(data[i], data[j]);
+        }
+    }
+    return result;
+}
+
+template
+DataFrame describe_dataframe(const DataFrame& df) { // builds a frame with one column of describe() values per input column
+    DataFrame result;
+    for (size_t i = 0; i < df.cols(); ++i) {
+        const auto& col = df.col(i);
+        auto stats = describe(col.to_vector());
+        std::vector stat_names; // NOTE(review): filled below but never read, so the statistic labels do not reach the result
+        std::vector stat_values;
+        for (const auto& [key, val] : stats) { // values are appended in the iteration order of the container returned by describe
+            stat_names.push_back(key);
+            stat_values.push_back(val);
+        }
+        result.add_column("column_" + std::to_string(i), stat_values); // output columns are named by position, not by the input column name
+    }
+    return result;
+}
+
+} // namespace stats + +} // namespace ml