From 07a31c71c819b2129f7eb47a8c2056de94143e0d Mon Sep 17 00:00:00 2001 From: Chris Nuernberger Date: Thu, 12 Sep 2024 10:40:15 -0600 Subject: [PATCH] Fixes #414 - categorical maps are integers now by default --- src/tech/v3/dataset/categorical.clj | 6 +++--- test/data/local_date.json | 4 ++++ test/tech/v3/dataset/categorical_test.clj | 10 ++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 test/data/local_date.json diff --git a/src/tech/v3/dataset/categorical.clj b/src/tech/v3/dataset/categorical.clj index 0f9075b2..a09d1612 100644 --- a/src/tech/v3/dataset/categorical.clj +++ b/src/tech/v3/dataset/categorical.clj @@ -104,7 +104,7 @@ Non integers found: " (vec bad-mappings))))) m (set/unique (ds-base/column dataset colname))) colname - (or res-dtype :float64)))) + (or res-dtype :int64)))) @@ -112,7 +112,7 @@ Non integers found: " (vec bad-mappings))))) "Apply a categorical mapping transformation fit with fit-categorical-map." [dataset fit-data] (let [colname (:src-column fit-data) - result-datatype (or (:result-datatype fit-data) :float64) + result-datatype (or (:result-datatype fit-data) :int64) lookup-table (:lookup-table fit-data) column (ds-base/column dataset colname) missing (ds-proto/missing column) @@ -231,7 +231,7 @@ user> (ds-cat/dataset->categorical-maps catds) dataset (dissoc dataset src-column) n-elems (dtype/ecount column) op-space (casting/simple-operation-space (dtype-proto/operational-elemwise-datatype column))] - (merge dataset + (merge dataset (->> one-hot-table (lznc/map (fn [[k v]] diff --git a/test/data/local_date.json b/test/data/local_date.json new file mode 100644 index 00000000..1a550e62 --- /dev/null +++ b/test/data/local_date.json @@ -0,0 +1,4 @@ +[ + {"test": 1, "time-period": "2024-06-20"}, + {"test": 2, "time-period": "2024-06-21"}, + {"test": 3, "time-period": "2024-06-22"}] \ No newline at end of file diff --git a/test/tech/v3/dataset/categorical_test.clj b/test/tech/v3/dataset/categorical_test.clj index 
507f157f..851ea030 100644
--- a/test/tech/v3/dataset/categorical_test.clj
+++ b/test/tech/v3/dataset/categorical_test.clj
@@ -77,3 +77,13 @@
                            (dtype/emap val-map :keyword col))))
              (ds/categorical->number cf/categorical)
              (ds/column "Survived")))))
+(deftest categorical-assignments-are-integers ; regression test for issue #414
+  (is (= #{0 1 2 3} ; expected codes: four distinct integers, one per category found in :y
+         (->
+          (ds/->dataset {:x1 [1 2 4 5 6 5 6 7]
+                         :x2 [5 6 6 7 8 2 4 6]
+                         :y [:a :b :b :a :c :a :b :d]}) ; :y holds four distinct categories: :a :b :c :d
+          (ds/categorical->number [:y]) ; only the column selector is passed, so the default result datatype is what gets exercised
+          (get :y) ; pull the encoded :y column back out of the dataset
+          distinct
+          set)))) ; reduce the encoded column to the set of codes actually assigned