2 回答

TA贡献1863条经验 获得超2个赞
一种做法是对时间戳做线性拟合,然后修正那些与拟合值的偏差超过指定阈值的时间戳(这些很可能是日和月被颠倒的日期)。示例:
import pandas as pd
import numpy as np
# Demo: build a date column in which day and month were swapped wherever that
# still yields a valid date, then repair it by comparing every timestamp with
# a linear fit through the whole column.


def swap_day_month(ts):
    """Return *ts* with day and month exchanged.

    A day greater than 12 cannot be used as a month, so such a timestamp
    cannot be the result of a day/month mix-up and is returned unchanged
    instead of raising ValueError.
    """
    if ts.day > 12:
        return ts
    return pd.Timestamp(ts.year, ts.day, ts.month)


def to_epoch_seconds(dates):
    """Convert a datetime Series to float seconds since 1970-01-01.

    Subtracting the epoch and dividing by a one-second Timedelta does not
    depend on the resolution the datetimes are stored in, unlike dividing
    the raw integer view by 10**9 (which assumes nanoseconds).
    """
    return (dates - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)


start = pd.Timestamp(2013, 1, 1)
# newest date first
dates = pd.date_range(start, periods=942)[::-1]

# map every true date to its "muddled" version
muddler = {}
for d in dates:
    if d.day < 13:
        muddler[d] = pd.Timestamp(d.year, d.day, d.month)
    else:
        muddler[d] = pd.Timestamp(d.year, d.month, d.day)

df = pd.DataFrame({'Date': dates})
df['Date'] = df['Date'].map(muddler)

# convert date col to posix timestamp
# (np.float was removed from NumPy, so it must not be used as a dtype here)
df['ts'] = to_epoch_seconds(df['Date'])

# calculate a linear fit for ts col
x = np.linspace(df['ts'].iloc[0], df['ts'].iloc[-1], df['ts'].size)
df['ts_linfit'] = np.polyval(np.polyfit(x, df['ts'], 1), x)

# set a thresh and derive a mask that masks differences between
# fit and timestamp greater than thresh:
thresh = 1.2e6  # seconds; you might want to tweak this...
m = np.absolute(df['ts'] - df['ts_linfit']) > thresh

# create new date col as copy of original
df['Date_filtered'] = df['Date']
# modify values that were caught in the mask
df.loc[m, 'Date_filtered'] = df.loc[m, 'Date'].apply(swap_day_month)

# also to posix timestamp
df['ts_filtered'] = to_epoch_seconds(df['Date_filtered'])

# Plot only when executed as a script / notebook cell, so that importing this
# module has no side effects and does not require a plotting backend.
if __name__ == "__main__":
    ax = df['ts'].plot(label='original')
    ax = df['ts_filtered'].plot(label='filtered')
    ax.legend()

TA贡献2065条经验 获得超14个赞
在尝试创建一个最小的可重现示例时,我实际上已经解决了我的问题——但我希望有一种更有效的方法来做我想做的事情……
# i first define a function to examine the dates
def disordered_muddle(date_series, future_first=True):
    """Check whether a series of dates is disordered or just muddled.

    The distinct dates are walked in order of first appearance. Each date is
    expected to be one day-of-year away from the previous one, or to fall in
    the adjacent calendar year (year rollover). A date that breaks this rule
    is "muddled" if swapping its day and month restores the expected
    day-of-year, and "disordered" otherwise.

    Args:
        date_series: Series of datetimes; only its ``unique()`` values are
            examined.
        future_first: If True (default) the dates are expected to run from
            newest to oldest; if False, from oldest to newest.

    Returns:
        ``False`` when the input is empty or no problem was found, otherwise
        a tuple ``(disordered, muddle)`` of two lists holding the offending
        dates as they appear in the input.
    """
    unique_dates = list(pd.Series(date_series.unique()))
    if not unique_dates:
        return False

    # expected change per step: -1 when newest comes first, +1 otherwise
    step = -1 if future_first else 1

    disordered = []
    muddle = []
    date = unique_dates[0]
    for d in unique_dates[1:]:
        in_sequence = d.dayofyear == date.dayofyear + step
        year_rollover = d.year == date.year + step
        if not (in_sequence or year_rollover):
            # Check if day and month are muddled. Only ValueError is caught:
            # it is what an impossible month (day > 12) raises, while any
            # other error is a real bug and must not be hidden.
            try:
                unmuddle = pd.Timestamp(d.year, d.day, d.month)
            except ValueError:
                unmuddle = None
            # No separate year test is needed here: the swap keeps d.year,
            # and the year-rollover case was already excluded above.
            if unmuddle is not None and unmuddle.dayofyear == date.dayofyear + step:
                muddle.append(d)
                # continue the walk from the corrected date
                d = unmuddle
            else:
                disordered.append(d)
        date = d

    if not disordered and not muddle:
        return False
    return disordered, muddle
# disordered_muddle returns False (not a tuple) when it finds nothing wrong,
# so its result cannot be unpacked unconditionally.
result = disordered_muddle(df['Date'])
disorder, muddle = result if result else ([], [])

# finally unmuddle the dates
# a set gives constant-time membership tests instead of scanning the list
muddled_dates = set(muddle)
date_correction = {}
for d in df['Date']:
    if d in muddled_dates:
        # flagged as a day/month swap: swap back
        date_correction[d] = pd.Timestamp(d.year, d.day, d.month)
    else:
        date_correction[d] = pd.Timestamp(d.year, d.month, d.day)
df['CorrectedDate'] = df['Date'].map(date_correction)

# re-check the corrected column
disordered_muddle(df['CorrectedDate'])
添加回答
举报