0

I web-scrapped multiple store locators, and now I want to find all the brands that each store has. As every brand has slightly different coordinates, I combined the data by creating a custom key (rounding lat and long to 4 decimals and adding the first 5 letters of the store name) but this has obvious limitations and is not good enough for my case.

My next approach should be perfect, but I am stuck. I thought to:

  1. Create circles of 100m around each point (st_transform and st_buffer based on this answer)
  2. Check if any of the circles with the same first 5 letters intersect (... group_by(name_key) %>% st_overlaps) and somehow get a unique group_id.
  3. Combine the data to using the group_id as key

Hope somebody can give me a hand because I am totally stuck.

tt <- tibble(lat  = c(41.38702918, 41.386601, 41.38744179, 41.3871449, 41.38896353, 41.38963478, 41.38666041, 41.38598465, 41.3867169, 41.3852132453, 41.38438262),
         long = c(2.170086301, 2.16939207, 2.17080332, 2.171450558, 2.167685415, 2.16892721, 2.170950285, 2.171100009, 2.171736392, 2.171215346, 2.170955304),
         name = c( "El Corte Inglés", "EL CORTE INGLÉS DIAGONAL (007)", "El Corte Inglés", "El Corte Inglés Barcelona", "Nadons", "Pops And Co", "Bitti", "BITTI", "BITTI", "Bitti", "Bitti"),
         brand = c( "b", "s", "c", "e", "e", "c", "m", "c", "s", "e", "b") ) %>% 
        mutate(name_key = tolower(name),
               name_key = iconv(name_key, from="UTF-8",to="ASCII//TRANSLIT"),
               name_key = str_remove_all(name_key, '[[:digit:]]'),
               name_key = str_remove_all(name_key, '[:punct:]'), 
               name_key = str_remove_all(name_key, '[:space:]'),
               # Remove the common company types from the name
               name_key = str_remove_all(name_key, 'ltda'),
               name_key = str_remove_all(name_key, 'ltd'),
               # Only get the first 5 letters
               name_key = str_sub(name_key, end = 5L))

leaflet() %>%
  # add different provider tiles
  addProviderTiles(
    "OpenStreetMap",
    # give the layer a name
    group = "OpenStreetMap") %>% 
  addCircleMarkers(data = tt,
                   radius = 4,
                   opacity = 0.7,
                   label = paste(
                     "Store name: ", tt$name, "<br>",
                     "name_id: ", tt$name_key,"<br>",
                     "Brand: ", tt$brand) %>%
                     lapply(htmltools::HTML) ) 
Xavier
  • 129
  • 7

3 Answers3

1

This seems to do the trick. With this, you can do all the data wrangling and group_by() summarization you want

library(sf)
library(tidyverse)

tt <- tibble(lat = c(41.436921, 41.4329208, 41.3829246, 41.398841, 41.3908955, 41.3790881, 41.3876734, 41.4091834, 41.3819518, 41.3878469, 41.3941022, 41.390988469863, 41.3917335, 41.3987172, 41.43623025, 41.3868139, 41.4329513, 41.387896, 41.3913932, 41.3876928, 41.401127, 41.3826847, 41.385063, 41.408131, 41.38855346, 41.39145716, 41.3883035, 41.387676, 41.3856535, 41.39719374, 41.4408444, 41.432961, 41.4331833, 41.4083721, 41.37511991, 41.3925928, 41.4361573, 41.401058, 41.40464, 41.3878013, 41.4136467, 41.390564, 41.3868788, 41.3865356, 41.4125671, 41.3896482, 41.3850639, 41.3919671, 41.3999835, 41.386993, 41.3878405, 41.3917335, 41.3987172, 41.382774, 41.388265, 41.4091834, 41.3828898, 41.386879, 41.3917335, 41.3987302, 41.409183, 41.432923, 41.4344164, 41.3869842, 41.40922657),
             long = c(2.1806423, 2.1893701, 2.1274556, 2.152954, 2.1440196, 2.1278398, 2.1668985, 2.2021878, 2.1757278, 2.127962, 2.1418273, 2.1391142945205, 2.1436192, 2.1528498, 2.1814508559036, 2.1560665191209, 2.1893589, 2.1666726, 2.1704746, 2.171124, 2.203366, 2.1731507, 2.173404, 2.188301, 2.17087001, 2.17049986, 2.1677237, 2.155939, 2.1539378, 2.16165528, 2.19848871, 2.189327, 2.189431, 2.1644419, 2.13273168, 2.1464824, 2.1817558, 2.151513, 2.154447, 2.1280387, 2.1385689, 2.1438591, 2.1560561, 2.1714268, 2.2113698, 2.1659005, 2.1734034999999, 2.1757278, 2.1487078, 2.16995, 2.16675529999997, 2.1436192, 2.1528498, 2.12765960000001, 2.127915, 2.2021878, 2.1274947, 2.156056, 2.1436192, 2.1528875, 2.202188, 2.189361, 2.1544574, 2.1208967, 2.20219468),
             name = c("El Corte Inglés", "Babis", "Experiencia bebe - Barcelona", "ALI-BEY nens", "Bitti", "El Corte Inglés", "Abitare Kids Barruguet", "Noari kids", "El Corte Inglés", "El Corte Inglés", "Pops And Co", "El Corte Inglés", "BITTI", "Ali Bey", "El Corte Inglés", "Tienda bebés Barcelona - Nonetes & Nou mesos", "Babis", "Abitare Kids - Barruguet", "Palacio del bebe", "El Corte Inglés", "Noari Kids", "El Món D'en Dadà", "Els Tresors de la Panera", "Paloma", "Prenatal Barcelona", "Palacio del Bebé", "El Corte Inglés Barcelona", "Rabasa", "Nonetes", "Prenatal Barcelona", "Prenatal Barcelona", "Babi's", "Nou Mesos", "BCN Bebé", "Nadons", "El Corte Inglés Barcelona", "El Corte Inglés Barcelona", "La Mama Vaca", "Kangura Portabebes ", "El Corte Inglés Barcelona", "Bitti", "Corte Ingles", "Nonetes", "Corte Ingles", "Corte Ingles", "MAINADA KIDS", "LES 4 LLUNES", "EL PALACIO DEL BEBE", "CASTELLBELL", "EL CORTE INGLÉS PLAZA CATALUñA (002)", "ABITARE KIDS BARRUGUET BARCELONA", "BITTI", "ALÍ BEY NENS", "EXPERIENCIA BEBÉ BARCELONA", "EL CORTE INGLÉS DIAGONAL (007)", "NOARI KIDS- BARCELONA", "Experiencia Bebé (Barcelona)", "Nonetes", "Bitti", "Ali Bey Nens", "Noari Kids", "Babis", "Nonetes & Nou mesos", "Nonetes & Nou mesos", "Noari Kids Barcelona"),
             brand = c("b", "b", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "e", "m", "m", "m", "m", "m", "sc", "sc", "sc", "sc", "s", "s", "s", "s", "s", "s", "s", "e", "e", "e", "e", "e", "e", "n", "n", "n") ) %>% 
  mutate(name_key = tolower(name),
         name_key = iconv(name_key, from="UTF-8",to="ASCII//TRANSLIT"),
         name_key = str_remove_all(name_key, '[[:digit:]]'),
         name_key = str_remove_all(name_key, '[:punct:]'), 
         name_key = str_remove_all(name_key, '[:space:]'),
         # Remove the common company types from the name
         name_key = str_remove_all(name_key, 'ltda'),
         name_key = str_remove_all(name_key, 'ltd'),
         # Only get the first 5 letters
         name_key = str_sub(name_key, end = 5L))


tt <- tt %>% 
  st_as_sf(coords = c("long", "lat"),
           crs = 4326) %>% 
  mutate(id = row_number())

tt

tt_buffer <- tt %>% 
  st_buffer(100)


tt <- tt %>% 
  mutate(intersects = st_intersects(.,
                                    tt_buffer))


tt_long <- unnest(tt, intersects) %>% 
  arrange(name, intersects)

tt <- tt %>% 
  st_drop_geometry() %>% 
  select(-intersects)

tt_long <- tt_long %>% 
  tidylog::left_join(.,
                     tt,
                     by = c("intersects" = "id"))
NorthNW
  • 157
  • 6
0

Convert your tibble to a sf object before passing to leaflet:

tt_sf <- st_as_sf(tt, coords = c("long", "lat"), crs=4326)

leaflet() %>%
  # add different provider tiles
  addProviderTiles(
    "OpenStreetMap",
    # give the layer a name
    group = "OpenStreetMap") %>% 
  addCircleMarkers(data=tt_sf, 
                   radius = 4,
                   opacity = 0.7,
                   label = paste(
                     "Store name: ", tt$name, "<br>",
                     "name_id: ", tt$name_key,"<br>",
                     "Brand: ", tt$brand) %>%
                     lapply(htmltools::HTML))
BEVAN
  • 426
  • 2
  • 12
0

Thank you all for your answers.

At the end I sort of figure it out. If you buff the polygon far enough, you will eventually have all the polygons with the same name_key intersecting with each other. I settled with 250m for my case, but it might be different in yours depending on the coordinates accuracy.

  1. You should arrange the data by the name_key, lat and long. It is important for the sapply unlist later (need to keep the same order).
tt <- tt %>% arrange(name_key)
  1. Make the polygon and split it by the name_key. This will allow you to compare polygons that have the same name_key.
tt$flag <- tt %>%
  st_as_sf(coords = c("long", "lat"), remove = FALSE, crs = 4326) %>% 
  st_buffer(100) %>% 
  split(all_store_data$name_id) %>% 
  # Check if polygon touches any other polygon with same name_id
  sapply(function(x) st_intersects(x)) %>%
  unlist(recursive = FALSE)

3.-. . Create a custom polygon_key

tt %>%
  mutate(poly_key = paste(name_key, flag))
Xavier
  • 129
  • 7