From 042d5ecf4ece770ad407ff1cb0991317fa68d5c8 Mon Sep 17 00:00:00 2001 From: Chris Nuernberger Date: Sat, 21 Aug 2021 16:51:05 -0600 Subject: [PATCH] Updated system to include row-map and re-released. --- deps.edn | 4 +- docs/index.html | 4 +- docs/tech.v3.dataset.html | 54 ++++++++++++++++++++------- docs/tech.v3.datatype.argops.html | 2 +- docs/tech.v3.datatype.functional.html | 2 +- docs/tech.v3.datatype.html | 2 +- docs/tech.v3.libs.cljs-ajax.html | 2 +- src/tech/v3/dataset.cljs | 39 +++++++++++++++++++ 8 files changed, 88 insertions(+), 21 deletions(-) diff --git a/deps.edn b/deps.edn index 83204be..e3dd23c 100644 --- a/deps.edn +++ b/deps.edn @@ -1,7 +1,7 @@ {:paths ["src"] ;;We serialize datasets to transit-json :deps {com.cognitect/transit-cljs {:mvn/version "0.8.269"} - techascent/tech.ml.dataset {:mvn/version "6.015"} + techascent/tech.ml.dataset {:mvn/version "6.016"} tick/tick {:mvn/version "0.4.24-alpha"} com.cognitect/transit-clj {:mvn/version "1.0.324"}} :aliases @@ -21,7 +21,7 @@ :exec-fn hf.depstar/jar :exec-args {:group-id "com.cnuernber" :artifact-id "tmdjs" - :version "1.000-beta-8" + :version "1.000-beta-9" :sync-pom true :jar "target/tmdjs.jar"}} ;;deploy to clojars diff --git a/docs/index.html b/docs/index.html index 22a8ed8..61fce79 100644 --- a/docs/index.html +++ b/docs/index.html @@ -1,9 +1,9 @@ - 1.000-beta-7

1.000-beta-7

Dataframe processing for ClojureScript.

Namespaces

tech.v3.dataset

Dataframe (map of columns) data processing system for clojurescript. + 1.000-beta-9

1.000-beta-9

Dataframe processing for ClojureScript.

Namespaces

tech.v3.datatype.argops

Index-space algorithms. Implements a subset of the jvm-version.

diff --git a/docs/tech.v3.dataset.html b/docs/tech.v3.dataset.html index 9300dce..8bbf332 100644 --- a/docs/tech.v3.dataset.html +++ b/docs/tech.v3.dataset.html @@ -1,6 +1,6 @@ -tech.v3.dataset documentation

tech.v3.dataset

Dataframe (map of columns) data processing system for clojurescript. +tech.v3.dataset documentation

tech.v3.dataset

Dataframe (map of columns) data processing system for clojurescript. This API is a simplified version of the jvm-version's api.

Datasets are maps of columns so assoc will add a new column and dissoc @@ -46,7 +46,7 @@

column

(column ds k)

Return the column at position k. Failing to find the column is an error.

column->data

(column->data col)

Transform a column into raw data safe for passing to transit or edn.

-

column-count

(column-count ds)

Integer column count of the dataset.

+

column-count

(column-count ds)

Integer column count of the dataset.

column-map

(column-map dataset result-colname map-fn res-dtype-or-opts filter-fn-or-ds)(column-map dataset result-colname map-fn filter-fn-or-ds)(column-map dataset result-colname map-fn)

Produce a new (or updated) column as the result of mapping a fn over columns.

  • dataset - dataset.
  • @@ -150,11 +150,11 @@

concat

(concat ds & args)

This is a copying concatenation so the result will be realized. Missing columns will be filled in with missing values.

data->column

(data->column {:keys [metadata missing data]})

Transform data produced via column->data into a column

-

data->dataset

(data->dataset ds-data)

Given data produced via dataset->data create a new dataset.

-

dataset->data

(dataset->data ds)

Convert a dataset into a pure data datastructure save for transit or direct json +

data->dataset

(data->dataset ds-data)

Given data produced via dataset->data create a new dataset.

+

dataset->data

(dataset->data ds)

Convert a dataset into a pure data datastructure safe for transit or direct json serialization. Uses base64 encoding of numeric data.

-

dataset->transit-str

(dataset->transit-str ds & [format handlers])

Write a transit string adding in the dataset write handler

-

dataset?

(dataset? ds)

Return true of this is a dataset.

+

dataset->transit-str

(dataset->transit-str ds & [format handlers])

Write a transit string adding in the dataset write handler

+

dataset?

(dataset? ds)

Return true if this is a dataset.

filter

(filter ds pred)

Filter the dataset. Pred gets passed each row as a map.

filter-column

(filter-column ds colname & [pred])

Filter the dataset by column colname. If pred isn't passed in the column's values are treated as truthy.

@@ -171,7 +171,7 @@

group-by-column

(group-by-column ds colname)

Group the dataset by column colname

head

(head ds n)(head ds)

Return the first n rows of the dataset.

intersect-missing-sets

(intersect-missing-sets col-seq)

Intersect the missing sets of the columns

-

merge-by-column

(merge-by-column lhs rhs colname)

Merge rows assuming left, right have the same columns. Left is taken first then +

merge-by-column

(merge-by-column lhs rhs colname)

Merge rows assuming left, right have the same columns. Left is taken first then any right not appear with left are appended. This is far less general but much faster than a join operation; it is useful for merging timeseries data.

missing

(missing ds-or-col)

Return the missing set as a clojure set. The underlying protocol returns @@ -191,7 +191,35 @@ missing both arguments may be nil.

row-at

(row-at ds idx)

Get row as a map at index idx. Negative indexes index from the end.

row-count

(row-count ds-or-col)

Integer row count of the dataset.

-

rows

(rows ds)

Get a sequence of maps from a dataset

+

row-map

(row-map ds map-fn & [options])

Map a function across the rows of the dataset producing a new dataset +that is merged back into the original potentially replacing existing columns. +Options are passed into the ->dataset function so you can control the resulting +column types by the usual dataset parsing options described there.

+

Examples:

+
cljs.user> (def stocks (ds/transit-file->dataset "test/data/stocks.transit-json"))
+#'cljs.user/stocks
+cljs.user> (ds/head stocks)
+#dataset[https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3]
+| :symbol |      :date | :price |
+|---------|------------|-------:|
+|    MSFT | 2000-01-01 |  39.81 |
+|    MSFT | 2000-02-01 |  36.35 |
+|    MSFT | 2000-03-01 |  43.22 |
+|    MSFT | 2000-04-01 |  28.37 |
+|    MSFT | 2000-05-01 |  25.45 |]
+cljs.user> (ds/head (ds/row-map stocks (fn [row]
+                                    {:symbol (keyword (row :symbol))
+                                     :price2 (* (row :price)(row :price))})))
+#dataset[https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 4]
+| :symbol |      :date | :price |       :price2 |
+|---------|------------|-------:|--------------:|
+|   :MSFT | 2000-01-01 |  39.81 | 1584.83610000 |
+|   :MSFT | 2000-02-01 |  36.35 | 1321.32250000 |
+|   :MSFT | 2000-03-01 |  43.22 | 1867.96840000 |
+|   :MSFT | 2000-04-01 |  28.37 |  804.85690000 |
+|   :MSFT | 2000-05-01 |  25.45 |  647.70250000 |]
+
+

rows

(rows ds)

Get a sequence of maps from a dataset

rowvec-at

(rowvec-at ds idx)

Get row as a vec of values at index idx. Negative indexes index from the end.

rowvecs

(rowvecs ds)

Get a sequence of persistent vectors from a dataset

select

(select ds cols rows)

Select a subrect of the dataset.

@@ -202,11 +230,11 @@

sort-by-column

(sort-by-column ds colname & [sort-op])

Sort the dataset by column colname

tail

(tail ds n)(tail ds)

Return the last n rows of the dataset.

transit-file->dataset

(transit-file->dataset fname)

Given a file of transit data return a dataset. This only works on Node.

-

transit-read-handler-map

(transit-read-handler-map)

Return a map mapping the dataset tag to a transit read handler.

-

transit-str->dataset

(transit-str->dataset json-data & [format handlers])

Parse a transit string adding in the dataset read handler

-

transit-write-handler-map

(transit-write-handler-map)

Return a map mapping the dataset type to a transit writer handler.

-

union-missing-sets

(union-missing-sets col-seq)

Union the missing sets of the columns

-

unique-by

(unique-by ds f)

Unique-by taking first

+

transit-read-handler-map

(transit-read-handler-map)

Return a map mapping the dataset tag to a transit read handler.

+

transit-str->dataset

(transit-str->dataset json-data & [format handlers])

Parse a transit string adding in the dataset read handler

+

transit-write-handler-map

(transit-write-handler-map)

Return a map mapping the dataset type to a transit writer handler.

+

union-missing-sets

(union-missing-sets col-seq)

Union the missing sets of the columns

+

unique-by

(unique-by ds f)

Unique-by taking first

unique-by-column

(unique-by-column ds colname)

Unique-by taking first

update

(update lhs-ds filter-fn-or-ds update-fn & args)

Update this dataset. Filters this dataset into a new dataset, applies update-fn, then merges the result into original dataset.

diff --git a/docs/tech.v3.datatype.argops.html b/docs/tech.v3.datatype.argops.html index 9e8e44d..ea42564 100644 --- a/docs/tech.v3.datatype.argops.html +++ b/docs/tech.v3.datatype.argops.html @@ -1,6 +1,6 @@ -tech.v3.datatype.argops documentation

tech.v3.datatype.argops

Index-space algorithms. Implements a subset of the jvm-version.

+tech.v3.datatype.argops documentation

tech.v3.datatype.argops

Index-space algorithms. Implements a subset of the jvm-version.

argfilter

(argfilter pred data)(argfilter data)

Return an array of indexes that pass the filter.

arggroup

(arggroup data)

Return a map from value->indexes that hold that value.

arglast-every

(arglast-every rdr pred)

Return the last index where (pred (rdr idx) (rdr (dec idx))) was true by diff --git a/docs/tech.v3.datatype.functional.html b/docs/tech.v3.datatype.functional.html index 90e6c44..5b01b45 100644 --- a/docs/tech.v3.datatype.functional.html +++ b/docs/tech.v3.datatype.functional.html @@ -1,6 +1,6 @@ -tech.v3.datatype.functional documentation

tech.v3.datatype.functional

Simple math primitives.

+tech.v3.datatype.functional documentation

tech.v3.datatype.functional

Simple math primitives.

descriptive-statistics

(descriptive-statistics stats v)

Given a sequence of desired stats return a map of statname->value.

Example:

cljs.user> (dfn/descriptive-statistics [:min :max :mean :n-values] (range 10))
diff --git a/docs/tech.v3.datatype.html b/docs/tech.v3.datatype.html
index efbd1f2..59bc7bf 100644
--- a/docs/tech.v3.datatype.html
+++ b/docs/tech.v3.datatype.html
@@ -1,6 +1,6 @@
 
-tech.v3.datatype documentation

tech.v3.datatype

Support for programming with arrays and a fast set implementation for indexe (int32) values. +tech.v3.datatype documentation

tech.v3.datatype

Support for programming with arrays and a fast set implementation for index (int32) values. For complex/higher order algorithms see tech.v3.datatype.argops. For mathematical primitives, see tech.v3.datatype.functional

->js-set

(->js-set)(->js-set data)

Create a javascript set. These have superior performance when dealing with numeric diff --git a/docs/tech.v3.libs.cljs-ajax.html b/docs/tech.v3.libs.cljs-ajax.html index 7145bab..4c44e2f 100644 --- a/docs/tech.v3.libs.cljs-ajax.html +++ b/docs/tech.v3.libs.cljs-ajax.html @@ -1,6 +1,6 @@ -tech.v3.libs.cljs-ajax documentation

tech.v3.libs.cljs-ajax

Bindings to use the dataset handlers in cljs GET/POST calls.

+tech.v3.libs.cljs-ajax documentation

tech.v3.libs.cljs-ajax

Bindings to use the dataset handlers in cljs GET/POST calls.

GET

(GET url options)

Drop in replacement for cljs-ajax.core/GET

opt-map

Options map that must be included in the cljs-ajax request in order to activate dataset->transit pathways.

diff --git a/src/tech/v3/dataset.cljs b/src/tech/v3/dataset.cljs index 61f4f9c..2026d92 100644 --- a/src/tech/v3/dataset.cljs +++ b/src/tech/v3/dataset.cljs @@ -722,6 +722,7 @@ user> (ds/missing (*1 :c)) (column-map dataset result-colname map-fn nil (column-names dataset)))) + (defn union-missing-sets "Union the missing sets of the columns" [col-seq] @@ -734,6 +735,44 @@ user> (ds/missing (*1 :c)) (reduce dtype/set-and (map ds-proto/-missing col-seq))) +(defn row-map + "Map a function across the rows of the dataset producing a new dataset + that is merged back into the original potentially replacing existing columns. + Options are passed into the [[->dataset]] function so you can control the resulting + column types by the usual dataset parsing options described there. + + Examples: + +```clojure +cljs.user> (def stocks (ds/transit-file->dataset \"test/data/stocks.transit-json\")) +#'cljs.user/stocks +cljs.user> (ds/head stocks) +#dataset[https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3] +| :symbol | :date | :price | +|---------|------------|-------:| +| MSFT | 2000-01-01 | 39.81 | +| MSFT | 2000-02-01 | 36.35 | +| MSFT | 2000-03-01 | 43.22 | +| MSFT | 2000-04-01 | 28.37 | +| MSFT | 2000-05-01 | 25.45 |] +cljs.user> (ds/head (ds/row-map stocks (fn [row] + {:symbol (keyword (row :symbol)) + :price2 (* (row :price)(row :price))}))) +#dataset[https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 4] +| :symbol | :date | :price | :price2 | +|---------|------------|-------:|--------------:| +| :MSFT | 2000-01-01 | 39.81 | 1584.83610000 | +| :MSFT | 2000-02-01 | 36.35 | 1321.32250000 | +| :MSFT | 2000-03-01 | 43.22 | 1867.96840000 | +| :MSFT | 2000-04-01 | 28.37 | 804.85690000 | +| :MSFT | 2000-05-01 | 25.45 | 647.70250000 |] +```" + [ds map-fn & [options]] + (merge ds (->> (rows ds) + (dtype/emap map-fn 
:object) + (->>dataset options)))) + + (defn- numeric-data->b64 [data] (let [data (clone data)