代码之家  ›  专栏  ›  技术社区  ›  user8834780

突出显示图中的异常值

  •  -1
  • user8834780  · 技术社区  · 6 年前

    对于以下异常值检测功能:

    days = df['days'].dropna()
    print(days.to_string())
    
    1       350.0
    2       641.0
    5       389.0
    6       130.0
    9       344.0
    16       92.0
    21      392.0
    24       51.0
    25       28.0
    28      358.0
    31      309.0
    34       64.0
    35      380.0
    36      491.0
    44      332.0
    46      410.0
    52       66.0
    54      435.0
    58      156.0
    59      294.0
    60       75.0
    63      284.0
    64      105.0
    68       34.0
    69       50.0
    75      155.0
    77      427.0
    78      327.0
    81      116.0
    87       97.0
    88      274.0
    89      315.0
    93       99.0
    95       70.0
    103      62.0
    106     241.0
    108     397.0
    110      50.0
    112      41.0
    115     231.0
    116     238.0
    117     216.0
    126     105.0
    140      36.0
    141     192.0
    144      38.0
    147     122.0
    150      37.0
    159     236.0
    163     175.0
    169     138.0
    179     146.0
    202     125.0
    208     144.0
    210     166.0
    221      19.0
    240     155.0
    242     130.0
    255      54.0
    264     120.0
    270      65.0
    271      95.0
    275     158.0
    280      92.0
    301      65.0
    313      52.0
    318      91.0
    329      67.0
    332      38.0
    333      72.0
    357      36.0
    393      14.0
    399      74.0
    402     155.0
    409     503.0
    411     110.0
    412     338.0
    428     444.0
    438     408.0
    439     107.0
    448     214.0
    449     291.0
    454      91.0
    455     277.0
    461      96.0
    462     325.0
    463     154.0
    465     314.0
    468     377.0
    470     147.0
    471      48.0
    482     224.0
    486      75.0
    490     268.0
    500     135.0
    502     177.0
    508     133.0
    509     306.0
    510     187.0
    515     145.0
    520     353.0
    521     148.0
    539     182.0
    545      95.0
    547      82.0
    548      64.0
    552     143.0
    557      79.0
    567     168.0
    582     141.0
    585     224.0
    598      82.0
    617     202.0
    635     107.0
    637     169.0
    639     153.0
    659     156.0
    660      79.0
    666      49.0
    679     126.0
    687      44.0
    694      67.0
    704      64.0
    708     102.0
    721      74.0
    807      56.0
    810     102.0
    814     285.0
    817     386.0
    826     176.0
    833     106.0
    838       6.0
    842     322.0
    844      72.0
    847     192.0
    848     429.0
    855     101.0
    856     159.0
    867     168.0
    872     319.0
    874     178.0
    880     323.0
    881     295.0
    886     151.0
    887     286.0
    889      93.0
    891     336.0
    901     252.0
    903     111.0
    904      49.0
    905     113.0
    915     214.0
    926     230.0
    960      77.0
    962     192.0
    964     219.0
    979     166.0
    981      72.0
    989     143.0
    999     166.0
    1022    140.0
    1023    191.0
    1060    113.0
    1061     83.0
    1063     41.0
    1070     28.0
    1085     84.0
    1105     78.0
    1119     28.0
    1147    202.0
    1149    223.0
    1157    188.0
    1160    238.0
    1161    212.0
    1162    133.0
    1164    235.0
    1172    212.0
    1175    243.0
    1184    176.0
    1195    167.0
    1250     69.0
    1251    108.0
    1301     11.0
    1306     35.0
    1310     63.0
    1323     38.0
    1390    111.0
    1391    135.0
    1401    143.0
    1426     70.0
    1434    143.0
    
    def outliers_iqr(ys):
        """Locate the entries of *ys* that fall outside the 1.5 * IQR fences.

        Returns a pair ``(below, above)``. Each element is the result of
        ``np.where`` applied to the comparison against the lower and the
        upper fence respectively.
        """
        q1, q3 = np.percentile(ys, [25, 75])
        # Distance from each quartile to its fence.
        margin = (q3 - q1) * 1.5
        below = np.where(ys < q1 - margin)
        above = np.where(ys > q3 + margin)
        return below, above
    
    outliers = outliers_iqr(days)   
    print(outliers)
    

    我得到以下信息:

    ((array([], dtype=int64),), (array([ 1, 13, 74]),))
    

    我知道返回了两个数组,一个对应低于下界的值(在我们的例子中为空),另一个对应高于上界的值。所以这里没有小于下界的离群值,但是有三个大于上界的离群值。

    如果我 print("Count Outliers: " + str(len(days.where(days>upper_bound).dropna()))) 我得到491、503和641(上界是460)。如图所示,打印的数组给出了1,13,74,我假设它是序列中异常值的位置。

    但是,我如何使用它,以便:

    # Plot the quartiles of `days` as red diamonds, then try to shade the
    # regions occupied by outliers.
    percentiles= np.array([25,50,75])
    x_p = np.percentile(days, percentiles)  # x: value of days at each percentile
    y_p = percentiles/100.0  # y: the percentiles expressed as fractions
    _ = plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')
    ax=plt.gca() 
    # NOTE(review): if `outliers` is still the 2-tuple returned by
    # outliers_iqr, len(outliers) is always 2, so this guard is always true.
    if len(outliers)>0:
        # NOTE(review): these are the calls the question reports as raising
        # ValueError; `outliers` holds index arrays, not data values.
        ax.fill_betweenx(y_p[0], outliers, x_p[0], where= outliers<x_p[0], facecolor='red', alpha=0.3)
        ax.fill_betweenx(y_p[2], x_p[2], outliers, where= outliers>x_p[2], facecolor='red', alpha=0.3)
    

    我得到一个 ValueError 错误: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() 出现在 ax.fill_betweenx 行中,可能是由于存在多个“异常值”。

    1 回复  |  直到 6 年前
        1
  •  1
  •   user8834780    6 年前

    下面尝试让您的代码可以复现:

    import numpy as np
    import matplotlib.pyplot as plt
    
    # note: it is much better than a printout of the variable, you can 
    #       actually use this data in *days* variable to work with your example
    
    days = [350.0, 641.0, 389.0, 130.0, 344.0, 92.0, 392.0, 51.0, 28.0, 358.0, 
            309.0, 64.0, 380.0, 491.0, 332.0, 410.0, 66.0, 435.0, 156.0, 294.0, 
            75.0, 284.0, 105.0, 34.0, 50.0, 155.0, 427.0, 327.0, 116.0, 97.0, 
            274.0, 315.0, 99.0, 70.0, 62.0, 241.0, 397.0, 50.0, 41.0, 231.0, 
            238.0, 216.0, 105.0, 36.0, 192.0, 38.0, 122.0, 37.0, 236.0, 175.0, 
            138.0, 146.0, 125.0, 144.0, 166.0, 19.0, 155.0, 130.0, 54.0, 120.0, 
            65.0, 95.0, 158.0, 92.0, 65.0, 52.0, 91.0, 67.0, 38.0, 72.0, 36.0, 
            14.0, 74.0, 155.0, 503.0, 110.0, 338.0, 444.0, 408.0, 107.0, 214.0, 
            291.0, 91.0, 277.0, 96.0, 325.0, 154.0, 314.0, 377.0, 147.0, 48.0, 
            224.0, 75.0, 268.0, 135.0, 177.0, 133.0, 306.0, 187.0, 145.0, 353.0, 
            148.0, 182.0, 95.0, 82.0, 64.0, 143.0, 79.0, 168.0, 141.0, 224.0, 82.0,
            202.0, 107.0, 169.0, 153.0, 156.0, 79.0, 49.0, 126.0, 44.0, 67.0, 64.0, 
            102.0, 74.0, 56.0, 102.0, 285.0, 386.0, 176.0, 106.0, 6.0, 322.0, 72.0, 
            192.0, 429.0, 101.0, 159.0, 168.0, 319.0, 178.0, 323.0, 295.0, 151.0, 
            286.0, 93.0, 336.0, 252.0, 111.0, 49.0, 113.0, 214.0, 230.0, 77.0,
            192.0, 219.0, 166.0, 72.0, 143.0, 166.0, 140.0, 191.0, 113.0, 83.0, 
            41.0, 28.0, 84.0, 78.0, 28.0, 202.0, 223.0, 188.0, 238.0, 212.0, 133.0, 
            235.0, 212.0, 243.0, 176.0, 167.0, 69.0, 108.0, 11.0, 35.0, 63.0, 38.0,
            111.0, 135.0, 143.0, 70.0, 143.0]
    
    def get_bounds(ys):
        """Return the ``(lower, upper)`` 1.5 * IQR fences of *ys*.

        The fences sit 1.5 interquartile ranges below the 25th percentile
        and above the 75th percentile.
        """
        q1, q3 = np.percentile(ys, [25, 75])
        # Distance from each quartile to its fence.
        margin = (q3 - q1) * 1.5
        return q1 - margin, q3 + margin
    
    def get_upper_outliers(ys):
        """Return the values of *ys* at or above its upper 1.5 * IQR fence.

        The fence is computed from *ys* itself rather than from the
        module-level ``days``, so the function works for any input.
        Values are returned in their original order.
        """
        _, upper_bound = get_bounds(ys)
        return [y for y in ys if y >= upper_bound]
    
    def get_lower_outliers(ys):
        """Return the values of *ys* at or below its lower 1.5 * IQR fence.

        The fence is computed from *ys* itself rather than from the
        module-level ``days``, so the function works for any input.
        Values are returned in their original order.
        """
        lower_bound, _ = get_bounds(ys)
        return [y for y in ys if y <= lower_bound]
    
    # Collect the outlying values on each side of the IQR fences.
    max_outliers = get_upper_outliers(days) 
    min_outliers = get_lower_outliers(days) 
    
    # Sanity checks: the three high values match those reported in the
    # question, and there are no low-side outliers.
    assert max_outliers== [641.0, 491.0, 503.0]
    assert min_outliers == []
    
    # Plot the 0th/25th/50th/75th/100th percentiles of days as red diamonds:
    # x is the value at each percentile, y is the percentile itself.
    percentiles = np.array([0, 25, 50, 75, 100])
    x_p = np.percentile(days, percentiles)
    y_p = percentiles
    _ = plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')
    ax=plt.gca() 
    
    # my approximation of ax.fill_betweenx does not look right, but it is close
    # note ax.fill_betweenx and ax.fill_between are different functions!
    
    # Shade horizontally from x=0 to the largest low-side outlier, over the
    # y values in y_p. Skipped when there are no low-side outliers.
    if min_outliers: 
        ax.fill_betweenx(y_p, 0, np.max(min_outliers), facecolor='red', alpha=0.3) 
    
    # Shade horizontally from the smallest high-side outlier to x_p[4]
    # (the 100th percentile of days), over the y values in y_p.
    if max_outliers: 
        ax.fill_betweenx(y_p, np.min(max_outliers), x_p[4], facecolor='red', alpha=0.3)