From eb5d3bf8c5718541d42b99b008e84a18b146232a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Wed, 18 May 2016 14:29:40 +0000 Subject: [PATCH] add notes regarding real world gotchas --- README.md | 39 ++++++++++++++++++++++++++++++++++++--- example.py | 2 +- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4b8a5e3..18c8513 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,47 @@ -## k-Shape +# k-Shape Python implementation of [k-Shape](http://www.cs.columbia.edu/~jopa/kshape.html), a new fast and accurate unsupervised Time Series cluster algorithm -### Usage +## Usage ``` from kshape import kshape, zscore -time_series = [[1,2,3,4], [0,1,2,3], [-1,1,-1,1], [1,2,2,3]] +time_series = [[1,2,3,4], [0,1,2,3], [0,1,2,3], [1,2,2,3]] cluster_num = 2 clusters = kshape(zscore(time_series), cluster_num) +#=> [(array([-0.42860026, -1.15025211, 1.38751707, -0.42860026, 0.61993557]), [3]), +# (array([-1.56839539, -0.40686255, 0.84042433, 0.67778452, 0.45704908]), [0, 1, 2])] +``` + +Returns a list of tuples with the clusters found by kshape. The first value of the +tuple is the zscore normalized centroid. The second value of the tuple is the list of indices +of the series assigned to this cluster. +The results can be examined by drawing graphs of the zscore normalized values +and the corresponding centroid. + +## Gotchas when working with real-world time series + +- If the data is available from different sources with the same frequency but at different points in time, it needs to be aligned. +- In the following, a tab separated file is assumed, where each column is a different observation; + gaps in columns happen when only some of the values were obtained at a given point in time. 
+ +``` +import pandas as pd +# assuming the time series are stored in a tab separated file, where `time` is +# the name of the column containing the timestamp +df = pd.read_csv(filename, sep="\t", index_col='time', parse_dates=True) +df = df.fillna(method="bfill", limit=1e9) +# drop rows with the same time stamp +df = df.groupby(level=0).first() +``` + +- kshape also expects that no time series has a constant observation value or contains 'n/a' values + +``` +time_series = [] +for f in df.columns: + if not df[f].isnull().any() and df[f].var() != 0: + time_series.append(df[f]) ``` diff --git a/example.py b/example.py index f3a7dfc..0486d61 100644 --- a/example.py +++ b/example.py @@ -1,6 +1,6 @@ from kshape import kshape, zscore -time_series = [[1,2,3,4], [0,1,2,3], [-1,1,-1,1], [1,2,2,3]] +time_series = [[1,2,3,4,5], [0,1,2,3,4], [3,2,1,0,-1], [1,2,2,3,3]] cluster_num = 2 clusters = kshape(zscore(time_series), cluster_num) print(clusters)