From eb5d3bf8c5718541d42b99b008e84a18b146232a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= <joerg@higgsboson.tk>
Date: Wed, 18 May 2016 14:29:40 +0000
Subject: [PATCH] add notes regarding real world gotchas

---
 README.md  | 39 ++++++++++++++++++++++++++++++++++++---
 example.py |  2 +-
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 4b8a5e3..18c8513 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,47 @@
-## k-Shape
+# k-Shape
 
 Python implementation of [k-Shape](http://www.cs.columbia.edu/~jopa/kshape.html),
 a new fast and accurate unsupervised Time Series cluster algorithm
 
-### Usage
+## Usage
 
 ```
 from kshape import kshape, zscore
 
-time_series = [[1,2,3,4], [0,1,2,3], [-1,1,-1,1], [1,2,2,3]]
+time_series = [[1,2,3,4], [0,1,2,3], [0,1,2,3], [1,2,2,3]]
 cluster_num = 2
 clusters = kshape(zscore(time_series), cluster_num)
+#=> [(array([-0.42860026, -1.15025211,  1.38751707, -0.42860026,  0.61993557]), [3]),
+#    (array([-1.56839539, -0.40686255,  0.84042433,  0.67778452,  0.45704908]), [0, 1, 2])]
+```
+
+Returns a list of tuples with the clusters found by kshape. The first value of
+each tuple is the zscore-normalized centroid. The second value of the tuple is
+the list of indices of the series assigned to this cluster.
+The results can be examined by drawing graphs of the zscore normalized values
+and the corresponding centroid.
+
+## Gotchas when working with real-world time series
+
+- If the data is available from different sources with same frequency but at different points in time, it needs to be aligned.
+- In the following, a tab-separated file is assumed, where each column is a different observation;
+  gaps in columns happen when only a certain value at this point in time was obtained.
+
+```
+import pandas as pd
+# assuming the time series are stored in a tab-separated file, where `time` is
+# the name of the column containing the timestamp
+df = pd.read_csv(filename, sep="\t", index_col='time', parse_dates=True)
+df = df.fillna(method="bfill", limit=10**9)
+# drop rows with the same time stamp
+df = df.groupby(level=0).first()
+```
+
+- kshape also expects that no time series has a constant observation value or contains 'n/a' values
+
+```
+time_series = []
+for f in df.columns:
+  if not df[f].isnull().any() and df[f].var() != 0:
+    time_series.append(df[f])
 ```
diff --git a/example.py b/example.py
index f3a7dfc..0486d61 100644
--- a/example.py
+++ b/example.py
@@ -1,6 +1,6 @@
 from kshape import kshape, zscore
 
-time_series = [[1,2,3,4], [0,1,2,3], [-1,1,-1,1], [1,2,2,3]]
+time_series = [[1,2,3,4,5], [0,1,2,3,4], [3,2,1,0,-1], [1,2,2,3,3]]
 cluster_num = 2
 clusters = kshape(zscore(time_series), cluster_num)
 print(clusters)