0

The algorithm DBSCAN should be able to group points together in clusters based on their proximity.

However, the example in GCB Bigquery has only the number of clusters.

WITH Geos as
  (SELECT 1 as row_id, st_geogfromtext('point empty') as geo UNION ALL
    SELECT 2, st_geogfromtext('multipoint(1 1, 2 2, 4 4, 5 2)') UNION ALL
    SELECT 3, st_geogfromtext('point(14 15)') UNION ALL
    SELECT 4, st_geogfromtext('linestring(40 1, 42 34, 44 39)') UNION ALL
    SELECT 5, st_geogfromtext('polygon((40 2, 40 1, 41 2, 40 2))'))
SELECT row_id, geo, ST_CLUSTERDBSCAN(geo, 1e5, 1) OVER () AS cluster_num FROM
Geos ORDER BY row_id
+--------+-----------------------------------+-------------+
| row_id |                geo                | cluster_num |
+--------+-----------------------------------+-------------+
|      1 |          GEOMETRYCOLLECTION EMPTY |        NULL |
|      2 |    MULTIPOINT(1 1, 2 2, 5 2, 4 4) |           0 |
|      3 |                      POINT(14 15) |           1 |
|      4 |    LINESTRING(40 1, 42 34, 44 39) |           2 |
|      5 | POLYGON((40 2, 40 1, 41 2, 40 2)) |           2 |
+--------+-----------------------------------+-------------+

Is there any way to get a list of points for each individual cluster? I'm not that familiar with the OVER() notation.

Thadeu Melo
  • 103
  • 2

1 Answers1

1

You can GROUP BY the result by cluster_num, e.g.

WITH Geos as
  (SELECT 1 as row_id, st_geogfromtext('point empty') as geo UNION ALL
    SELECT 2, st_geogfromtext('multipoint(1 1, 2 2, 4 4, 5 2)') UNION ALL
    SELECT 3, st_geogfromtext('point(14 15)') UNION ALL
    SELECT 4, st_geogfromtext('linestring(40 1, 42 34, 44 39)') UNION ALL
    SELECT 5, st_geogfromtext('polygon((40 2, 40 1, 41 2, 40 2))')),
Clusters as 
  (SELECT row_id, geo, ST_CLUSTERDBSCAN(geo, 1e5, 1) OVER () AS cluster_num 
    FROM Geos)
SELECT cluster_num, ARRAY_AGG(row_id) as rowids, ST_Union_Agg(geo) as geo
FROM Clusters GROUP BY cluster_num

Gives you

Row cluster_num rowids  geo 
1   null        [1]     GEOMETRYCOLLECTION EMPTY
2   0           [2]     MULTIPOINT(1 1, 2 2, 5 2, 4 4)
3   1           [3]     POINT(14 15)
4   2           [4,5]   GEOMETRYCOLLECTION(LINESTRING(40.0531529143594 2.00001531864666, 42 34, 44 39), POLYGON((40 2, 40 1, 41 2, 40.0531529143594 2.00001531864666, 40 2)))
Michael Entin
  • 1,017
  • 5
  • 8