代码之家  ›  专栏  ›  技术社区  ›  Felipe Alvarenga

快速评估R中多边形列表内点的方法

  •  1
  • Felipe Alvarenga  · 技术社区  · 6 年前

    我有两个数据集,一个包含1300多万个矩形多边形(4个lat-lng点),另一个包含10000个点(指该位置的价格)。

    > polygons
         id                                 pol_lat                                 pol_lng
     1: 148 -4.250236,-4.250236,-4.254640,-4.254640 -49.94628,-49.94494,-49.94494,-49.94628
     2: 149 -4.254640,-4.254640,-5.361601,-5.361601 -49.94494,-49.07906,-49.07906,-49.94494
     3: 150 -5.361601,-5.361601,-5.212208,-5.212208 -49.07906,-49.04469,-49.04469,-49.07906
     4: 151 -5.212208,-5.212208,-5.002878,-5.002878 -49.04469,-48.48664,-48.48664,-49.04469
     5: 152 -5.002878,-5.002878,-5.080018,-5.080018 -48.48664,-48.43699,-48.43699,-48.48664
     6: 153 -5.080018,-5.080018,-5.079819,-5.079819 -48.43699,-48.42480,-48.42480,-48.43699
     7: 154 -5.079819,-5.079819,-5.155606,-5.155606 -48.42480,-47.53891,-47.53891,-48.42480
     8: 155 -5.155606,-5.155606,-4.954156,-4.954156 -47.53891,-47.50354,-47.50354,-47.53891
     9: 156 -4.954156,-4.954156,-3.675864,-3.675864 -47.50354,-45.39022,-45.39022,-47.50354
    10: 157 -3.675864,-3.675864,-3.706356,-3.706356 -45.39022,-45.30724,-45.30724,-45.39022
    11: 158 -3.706356,-3.706356,-3.705801,-3.705801 -45.30724,-45.30722,-45.30722,-45.30724
    > points
        longitude  latitude  price
     1: -47.50308 -4.953936 3.0616
     2: -47.50308 -4.953936 3.2070
     3: -47.50308 -4.953936 3.0630
     4: -47.50308 -4.953936 3.0603
     5: -47.50308 -4.953936 3.0460
     6: -47.50308 -4.953936 2.9900
     7: -49.07035 -5.283658 3.3130
     8: -49.08054 -5.347284 3.3900
     9: -49.08054 -5.347284 3.3620
    10: -49.21726 -5.338270 3.3900
    11: -49.08050 -5.347255 3.4000
    12: -49.08042 -5.347248 3.3220
    13: -49.08190 -5.359508 3.3130
    14: -49.08046 -5.347277 3.3560
    

    我想在所有适合每个多边形的点中为每个多边形生成一个平均价格。

    现在我正在使用 sp::point.in.polygon 得到适合于给定多边形的所有点的索引,然后得到其平均价格。

    w <- lapply(1:nrow(polygons),
                function(tt) {
                  ind <- point.in.polygon(points$latitude, points$longitude,
                                          polygons$pol_lat[[tt]], polygons$pol_lng[[tt]]) > 0
                  med <- mean(points$price[ind])
                  return(med)
                }
    )
    > unlist(w)
     [1]      NaN 3.361857 3.313000      NaN      NaN      NaN      NaN      NaN 3.071317      NaN      NaN
    

    然而,这显然是缓慢的。关于如何更快地完成它的任何想法,也许使用 data.table dplyr (或其他方式)?

    数据如下

    > dput(polygons)
    structure(list(id = 148:158, pol_lat = list(c(-4.2502356, -4.2502356, 
    -4.2546403, -4.2546403), c(-4.2546403, -4.2546403, -5.3616014, 
    -5.3616014), c(-5.3616014, -5.3616014, -5.2122078, -5.2122078
    ), c(-5.2122078, -5.2122078, -5.0028781, -5.0028781), c(-5.0028781, 
    -5.0028781, -5.0800181, -5.0800181), c(-5.0800181, -5.0800181, 
    -5.0798186, -5.0798186), c(-5.0798186, -5.0798186, -5.1556063, 
    -5.1556063), c(-5.1556063, -5.1556063, -4.9541564, -4.9541564
    ), c(-4.9541564, -4.9541564, -3.6758637, -3.6758637), c(-3.6758637, 
    -3.6758637, -3.706356, -3.706356), c(-3.706356, -3.706356, -3.7058011, 
    -3.7058011)), pol_lng = list(c(-49.9462826, -49.9449427, -49.9449427, 
    -49.9462826), c(-49.9449427, -49.0790599, -49.0790599, -49.9449427
    ), c(-49.0790599, -49.0446868, -49.0446868, -49.0790599), c(-49.0446868, 
    -48.4866355, -48.4866355, -49.0446868), c(-48.4866355, -48.436988, 
    -48.436988, -48.4866355), c(-48.436988, -48.4247989, -48.4247989, 
    -48.436988), c(-48.4247989, -47.5389072, -47.5389072, -48.4247989
    ), c(-47.5389072, -47.5035404, -47.5035404, -47.5389072), c(-47.5035404, 
    -45.3902168, -45.3902168, -47.5035404), c(-45.3902168, -45.3072392, 
    -45.3072392, -45.3902168), c(-45.3072392, -45.3072216, -45.3072216, 
    -45.3072392))), row.names = c(NA, -11L), class = c("data.table", 
    "data.frame"), .internal.selfref = <pointer: 0x00000000025e1ef0>)
    > dput(points)
    structure(list(longitude = c(-47.5030772, -47.5030772, -47.5030772, 
    -47.5030772, -47.5030772, -47.5030772, -49.0703469, -49.0805422, 
    -49.0805422, -49.217259, -49.0804978, -49.0804181, -49.0818997, 
    -49.0804625), latitude = c(-4.9539357, -4.9539357, -4.9539357, 
    -4.9539357, -4.9539357, -4.9539357, -5.283658, -5.3472839, -5.3472839, 
    -5.3382696, -5.3472551, -5.347248, -5.3595084, -5.3472768), price = c(3.0616, 
    3.207, 3.063, 3.0603, 3.046, 2.99, 3.313, 3.39, 3.362, 3.39, 
    3.4, 3.322, 3.313, 3.356)), row.names = c(NA, -14L), class = c("data.table", 
    "data.frame"), .internal.selfref = <pointer: 0x00000000025e1ef0>)
    
    1 回复  |  直到 6 年前
        1
  •  1
  •   lbusett Guest    6 年前

    如果你的“多边形”总是矩形 如示例中所示,一种可能性是使用在包中实现的四叉树空间索引。 SearchTrees ,以提高识别每个多边形中哪个点的速度。

    由于空间索引所允许的“比较”数量越少,它可以给您带来相当大的速度提升,空间索引所允许的“比较”数量越大, 在数据集中。

    例如:

    library(SearchTrees)
    library(magrittr)
    
    # Create a "beefier" test dataset based on your data: 14000 pts 
    # over 45000 polygons
    
    for (i in 1:10) points   <- rbind(points, points + runif(length(points)))
    for (i in 1:12) polygons <- rbind(polygons, polygons)
    
    
    # Compute limits of the polygons
    min_xs <- lapply(polygons$pol_lng , min) %>% unlist()
    max_xs <- lapply(polygons$pol_lng , max) %>% unlist()
    min_ys <- lapply(polygons$pol_lat , min) %>% unlist()
    max_ys <- lapply(polygons$pol_lat, max) %>% unlist()
    xlims <- cbind(min_xs, max_xs)
    ylims <- cbind(min_ys, max_ys)
    
    # Create the quadtree
    tree = SearchTrees::createTree(cbind(points[1],points[2]))
    
    #☻ extract averages, looping over polygons ----
    t1 <- Sys.time()
    w <- lapply(1:nrow(polygons), 
                function(tt) {
                  ind <- SearchTrees::rectLookup(
                    tree, 
                    xlims = xlims[tt,],
                    ylims = ylims[tt,]))
                  mean(points$price[ind])
    
                  })
    Sys.time() - t1
    

    时差2.945789秒

    w1 <- unlist(w)
    

    在我的旧笔记本电脑上,这种“幼稚”的实现比您对测试数据的原始方法快10倍多:

    t1 <- Sys.time()
    w <- lapply(1:nrow(polygons),
                function(tt) {
                  ind <- sp::point.in.polygon(points$latitude, points$longitude,
                                          polygons$pol_lat[[tt]], polygons$pol_lng[[tt]]) > 0
                  med <- mean(points$price[ind])
                  return(med)
                }
    )
    Sys.time() - t1
    w2 <- unlist(w)
    

    时差40.36493秒

    ,结果相同:

    > all.equal(w1, w2)
    [1] TRUE
    

    整体速度的提高将取决于您的点在空间范围内是如何“聚集”的,以及关于多边形的。

    如果多边形不是矩形,也可以考虑使用这种方法,首先提取每个多边形的bbox中包含的点,然后使用更标准的方法找到多边形“内部”的点。

    另外,考虑到该任务是严格并行的,因此您可以通过使用 foreach parlapply 接近多边形。

    嗯!

    推荐文章