6

I'm working with transit stops in GTFS. I want to group together stops (N~8000 for a medium sized agency in the USA) which are proximately related in order to reduce the number of dimensions but also group together stops that are a reasonable distance from each other: e.g. transfers. K-means requires an a priori specification of the number of clusters, whereas I would like the cluster diameter to be specified by the user. A bottom-up hierarchical clustering algorithm which stops at a specified diameter is what I need.

raphael
  • 3,407
  • 23
  • 61

1 Answers1

10

Previously I'd written a hierarchical clustering algorithm that operated on small groups of points, but it did not scale well to an 8000 point cloud. After some tinkering I got a revamped version to work. It does 8000 points in under 30s on a server, scaling at something approximating N*log(N).

First two tables definitions are required:

CREATE TABLE pt
(
  stop_id character varying(32) NOT NULL,
  geom geometry(Point)
)

And the function result definition:

CREATE TABLE clustered_pt
(
  stop_id character varying(32) NOT NULL,
  geom geometry(Point),
  cluster_id smallint
)

I haven't tested with more than 32000 points, but modify the type of cluster_id accordingly if you're going to use larger datasets.

CREATE OR REPLACE FUNCTION bottomup_cluster_index(points pt[], radius integer)
  RETURNS SETOF clustered_pt AS
$BODY$

DECLARE
    srid int;
    counter int:=1;

BEGIN
--Avoid the whole processing if there's only 1 point. 
IF array_length(points,1)<2 THEN
    RETURN QUERY SELECT stop_id::varchar(32), geom::geometry(point), 1 FROM unnest(points);
    RETURN;
END IF;


CREATE TEMPORARY TABLE IF NOT EXISTS stops (LIKE pt) ON COMMIT DROP;

CREATE TEMPORARY SEQUENCE clusterids;

CREATE TEMPORARY TABLE clusters(
    stop_group geometry,
    stop_ids varchar[],
    cluster_id smallint DEFAULT nextval('clusterids')
    ) ON COMMIT DROP;


ALTER SEQUENCE clusterids OWNED BY clusters.cluster_id;



TRUNCATE stops;
    --inserting points in 
INSERT INTO stops(stop_id, geom)
    (SELECT (unnest(points)).* ); 

--Store the srid to reconvert points after, assumes all points have the same SRID
srid := ST_SRID(geom) FROM stops LIMIT 1;

--Transforming points to a UTM coordinate system so distances will be calculated in meters.
UPDATE stops
SET geom =  ST_TRANSFORM(geom,26986);

INSERT INTO clusters(stop_group, stop_ids)
    (SELECT ST_COLLECT(geom), ARRAY_AGG(stop_id)
        FROM stops GROUP BY geom --Groups together points which are at the same location
    );

CREATE INDEX geom_index
ON clusters
USING gist
(stop_group);

Analyze clusters;

LOOP
    --If the shortest maximum distance between two clusters is greater than 2x the specified radius, then end the clustering algorithm.
    IF (SELECT ST_MaxDistance(a.stop_group,b.stop_group)  FROM clusters a, clusters b
        WHERE 
        ST_DFullyWithin(a.stop_group,b.stop_group, 2 * radius)
        AND a.cluster_id < b.cluster_id AND a.cluster_id > 0 AND b.cluster_id > 0
        ORDER BY ST_MaxDistance(a.stop_group,b.stop_group) LIMIT 1)
        IS NULl
    THEN
        EXIT;
    END IF;

    --Periodically reindex the clusters table
    ANALYZE clusters;

    counter := counter +1;

    WITH finding_nearest_clusters AS(
    SELECT DISTINCT ON (a.cluster_id) a.cluster_id, ST_collect(a.stop_group,b.stop_group) AS stop_group, ARRAY[a.cluster_id,b.cluster_id] as joined_clusters, a.stop_ids||b.stop_ids AS stop_ids
    FROM clusters a, clusters b
        WHERE ST_DFullyWithin(a.stop_group,b.stop_group, 2 * radius)
            AND a.cluster_id < b.cluster_id AND a.cluster_id > 0 AND b.cluster_id > 0
        ORDER BY a.cluster_id, ST_MaxDistance(a.stop_group,b.stop_group)
    )
    --If a cluster is linked to multiple nearest clusters, select only the shortest distance pairing, and flag the others.
    , unique_clusters AS(
    SELECT a.*, CASE WHEN ST_AREA(ST_MinimumBoundingCircle(a.stop_group))>= ST_AREA(ST_MinimumBoundingCircle(b.stop_group)) THEN 1 ELSE 0 END as repeat_flag 
    FROM finding_nearest_clusters a
    LEFT OUTER JOIN finding_nearest_clusters b ON a.cluster_id <> b.cluster_id AND a.joined_clusters && b.joined_clusters 
    )       
        --Update the set of clusters with the new clusters
    UPDATE clusters o SET 
        --Set to 0 the cluster_id of the cluster which will contain 0 data.
        cluster_id = CASE WHEN o.cluster_id = joined_clusters[2] THEN 0 ELSE joined_clusters[1] END
        ,stop_group = CASE WHEN o.cluster_id = joined_clusters[2] THEN NULL ELSE f.stop_group END
        ,stop_ids = CASE WHEN o.cluster_id = joined_clusters[2] THEN NULL ELSE f.stop_ids END
        FROM (SELECT DISTINCT ON (cluster_id) cluster_id, stop_group, joined_clusters, stop_ids, repeat_flag
            FROM unique_clusters 
            ORDER BY cluster_id, repeat_flag DESC
            ) f
        WHERE o.cluster_id = ANY (joined_clusters) AND repeat_flag =0;

    IF (SELECT COUNT(DISTINCT cluster_id) FROM clusters) < 2    THEN
        EXIT;                           
    END IF;

END LOOP;

RAISE NOTICE USING MESSAGE = $$Number of passes $$||counter;

RETURN QUERY 
    SELECT stop_id::varchar(32), ST_TRANSFORM(geom, srid)::geometry(point), cluster_id 
    FROM stops
    inner join (select cluster_id, unnest(stop_ids) AS stop_id FROM clusters)c USING (stop_id);
END;
$BODY$
  LANGUAGE plpgsql VOLATILE
 ;

Usage:

SELECT (clusters).* FROM (

    SELECT bottomup_cluster_index(array_agg((stop_id,geom)::pt), 250) as clusters 
    FROM  points
)a

Further optimization is welcome!

raphael
  • 3,407
  • 23
  • 61
  • Nice function. I might have a use for that. – John Powell Apr 28 '15 at 04:57
  • This can also easily be adapted to those looking for minimum-linkage clustering: larger clusters of points where no point is further than x from its closest neighbour. Though I'm not sure it's the fastest way, since the loop has to run m-1 times, where m is the number of points in the largest cluster. – raphael Apr 28 '15 at 15:42