代码之家  ›  专栏  ›  技术社区  ›  Alex Gao


  •  3
  • Alex Gao  · 技术社区  · 6 年前


    name  value 
    'foo' 2
    'bar' 4
    'bar' 3
    'foo' 1
      .   .
      .   .
      .   .
    'bar' 8



    funcs = {'foo': <FunctionObject>, 'bar': <FunctionObject>}
    def masterFunc(row):
        correctFunction = funcs[row['name']]
        row['output'] = correctFunction(row['value']) + 3*row['value']
    df.apply(masterFunc, axis=1).


    # interp1d takes the sample coordinates as ``x`` and ``y``.
    separateFunc = scipy.interpolate.interp1d(x=[2, 3, 4], y=[3, 5, 7])
    # separateFunc is now a callable that interpolates linearly between the
    # sample points, i.e. y = 2x - 1 on [2, 4]. Use case:
    y = separateFunc(3.5) # y == 6



    # interpolate.emissionsFunctions is a separate helper (it does some scipy
    # work); one result is stored per key, built from the matching curve file.
    interpolationFunctions = {
        2: interpolate.emissionsFunctions('./roadtype_2_curve.csv'),
        3: interpolate.emissionsFunctions('./roadtype_3_curve.csv'),
    }
    def compute_pollutants(row):
        """Fill in the 'CO2-Atm' and 'CO2-Eq' entries of ``row`` and return it.

        The set of curves is selected from ``interpolationFunctions`` by
        ``row['roadtype']``. Each curve is evaluated at ``row['speed']`` and
        the result is multiplied by ``row['length']``, ``row['speed']`` and a
        fixed factor.
        """
        curves = interpolationFunctions[row['roadtype']]
        speed, length = row['speed'], row['length']
        # NOTE(review): 0.00310686368 looks like a unit-conversion factor — confirm.
        for pollutant in ('CO2-Atm', 'CO2-Eq'):
            row[pollutant] = curves[pollutant](speed)*length*speed*0.00310686368
        return row
    1 回复  |  直到 6 年前
  •  1
  •   RPyStats    6 年前


    import numpy as np
    import pandas as pd
    import time as t
    # Example Functions
    def foo(x):
        """Return ``x`` added to itself."""
        doubled = x + x
        return doubled
    def bar(x):
        """Return ``x`` multiplied by itself."""
        squared = x * x
        return squared
    # Example Functions for multiple columns
    def foo2(x, y):
        """Return the sum ``x + y``."""
        total = x + y
        return total
    def bar2(x, y):
        """Return the product ``x * y``."""
        product = x * y
        return product
    # Dispatch tables: the value in the 'name' column selects the function.
    funcs = dict(foo=foo, bar=bar)
    funcs2 = dict(foo=foo2, bar=bar2)
    n_rows = 10 ** 6
    # Sample data: a random name per row and a normally distributed value
    # (mean 100, standard deviation 20).
    names = np.random.choice(list(funcs), size=n_rows)
    values = np.random.normal(100, 20, size=n_rows)
    df = pd.DataFrame({'name': names, 'value': values})
    # Independent copy so the different methods can be compared on the same data.
    df_copy = df.copy()
    # Modified original master function
    def masterFunc(row, functs):
        """Return the output value for one row.

        Args:
            row: mapping-like object with 'name' and 'value' entries.
            functs: mapping from a name to the one-argument function to call.

        Returns:
            ``functs[row['name']](row['value']) + 3 * row['value']``.

        Raises:
            KeyError: if ``row['name']`` has no entry in ``functs``.
        """
        # Use the mapping that was passed in rather than a module-level global.
        correctFunction = functs[row['name']]
        return correctFunction(row['value']) + 3*row['value']
    # Baseline: masterFunc is called once per row through DataFrame.apply,
    # with the dispatch table forwarded as an extra positional argument.
    t1 = t.time()
    df['output'] = df.apply(masterFunc, axis=1, args=(funcs,))
    t2 = t.time()
    print("Time for all rows/functions: ", t2 - t1)
    # For functions that can be vectorized using numpy: each function is
    # applied once to the whole array of values that belong to its name.
    t3 = t.time()
    output_dataframe_list = []
    for func_name, func in funcs.items():
        # .copy() makes the subset an independent frame, so adding a column
        # does not write to a slice of df_copy (SettingWithCopyWarning).
        df_subset = df_copy.loc[df_copy['name'] == func_name, :].copy()
        subset_values = df_subset['value'].values
        df_subset['output'] = func(subset_values) + 3 * subset_values
        # Collect the subset; pd.concat raises ValueError on an empty list.
        output_dataframe_list.append(df_subset)
    # The result is grouped by name; each row keeps its original index label.
    output_df = pd.concat(output_dataframe_list)
    t4 = t.time()
    print("Time for all rows/functions: ", t4 - t3)
    # Using a plain for loop over the numpy array of values is still faster
    # than DataFrame.apply, and works for functions that cannot be vectorized.
    t5 = t.time()
    output_dataframe_list2 = []
    for func_name, func in funcs2.items():
        # .copy() makes the subset an independent frame, so adding a column
        # does not write to a slice of df_copy (SettingWithCopyWarning).
        df_subset = df_copy.loc[df_copy['name'] == func_name, :].copy()
        col1_values = df_subset['value'].values
        outputs = np.zeros(len(col1_values))
        # The same column is passed as both arguments of the two-argument function.
        for i, v in enumerate(col1_values):
            outputs[i] = func(v, v) + 3 * v
        df_subset['output'] = outputs
        # Collect the subset; pd.concat raises ValueError on an empty list.
        output_dataframe_list2.append(df_subset)
    # The result is grouped by name; each row keeps its original index label.
    output_df2 = pd.concat(output_dataframe_list2)
    t6 = t.time()
    print("Time for all rows/functions: ", t6 - t5)