From 7b93a93e5f519bc4f7411101cd6b587437f74b8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Thalheim?=
Date: Wed, 25 May 2016 10:28:12 +0000
Subject: [PATCH] README.md: better downsampling code

---
 README.md | 9 ++++++---
 kshape.py | 2 --
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 83decaa..46abaac 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,12 @@ import pandas as pd
 # assuming the time series are stored in a tab seperated file, where `time` is
 # the name of the column containing the timestamp
 df = pd.read_csv(filename, sep="\t", index_col='time', parse_dates=True)
-df = df.fillna(method="bfill", limit=1e9)
-# drop rows with the same time stamp
-df = df.groupby(level=0).first()
+# Choose a meaningful sample rate based on the frequency of your time series:
+# a higher rate is more accurate, but if the series gets too long, the computation becomes CPU- and memory-intensive.
+# Keeping the length below 2000 values is usually a good idea.
+df = df.resample("500ms").mean()
+df.interpolate(method="time", limit_direction="both", inplace=True)
+df.fillna(method="bfill", inplace=True)
 ```
 
 - kshape also expect no time series with a constant observation value or 'n/a'

diff --git a/kshape.py b/kshape.py
index b72c2a3..9eb5d68 100644
--- a/kshape.py
+++ b/kshape.py
@@ -7,8 +7,6 @@
 from numpy.linalg import norm
 from numpy.fft import fft, ifft
 
-#from scipy.linalg import eigh
-
 def zscore(a, axis=0, ddof=0):
     a = np.asanyarray(a)
     mns = a.mean(axis=axis)
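
For context, a minimal self-contained sketch of the preprocessing that the new README snippet describes. The file name `series.tsv` and the `500ms` rate are placeholder assumptions; pick a rate that keeps each series below roughly 2000 values, as the added comment advises.

```
import pandas as pd

# Placeholder input: a tab-separated file with a `time` column holding timestamps.
filename = "series.tsv"
df = pd.read_csv(filename, sep="\t", index_col="time", parse_dates=True)

# Downsample by averaging within fixed windows; "500ms" is only an example rate.
df = df.resample("500ms").mean()

# Fill the gaps introduced by resampling: interpolate between known points first,
# then back-fill any leading NaNs that interpolation cannot reach.
df.interpolate(method="time", limit_direction="both", inplace=True)
df.fillna(method="bfill", inplace=True)

# Sanity check against the ~2000-value guideline from the README comment.
print(len(df))
```

Note that `resample(...).mean()` already merges rows sharing a timestamp into one bin, which is why the old `groupby(level=0).first()` deduplication step is no longer needed.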