Network distance tying it all together¶

Here, we will calculate the walking distance for students in public schools to their nearest subway entrance.

[37]:

import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from shapely.geometry import box, LineString, Point,MultiPoint
import os
import sys
from scipy.spatial import cKDTree
import numpy as np
from zipfile import ZipFile as zzip
import requests
from rtree import index
import rtree
import itertools
from shapely import wkt
from shapely.ops import split, snap
import time

sys.path.append(os.path.realpath('..'))

[6]:

os.getcwd()

[6]:

'C:\\Work\\Box Sync\\GIS_exploration\\code'

Get the 2019 “NYC LION Single Line Street Base Map”¶

[29]:

url = r"https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyclion_19b.zip"
filename = r"nyclion_19b.zip"
r = requests.get(url)
# open method to open a file on your system and write the contents
with open("../input_data/"+filename, "wb") as file:
    file.write(r.content)

[30]:

fp = "../input_data/"
foldername = "nyclion_19b"
with zzip(fp+filename, 'r') as file:
        #printing all the contents of the zip file
        #file.printdir()
        path = "../input_data/"+foldername
        os.mkdir(path)
        # extracting all the files
        print('Extracting all the files now...')
        file.extractall(fp+foldername)
        print('Done!')

File Name                                             Modified             Size
lion/lion.gdb/a00000001.freelist               2019-05-13 12:48:56         4440
lion/lion.gdb/a00000001.gdbindexes             2019-05-13 12:42:56          110
lion/lion.gdb/a00000001.gdbtable               2019-05-13 12:48:56          385
lion/lion.gdb/a00000001.gdbtablx               2019-05-13 12:48:56         5152
lion/lion.gdb/a00000001.TablesByName.atx       2019-05-13 12:48:56         4118
lion/lion.gdb/a00000002.gdbtable               2019-05-13 12:42:56         2055
lion/lion.gdb/a00000002.gdbtablx               2019-05-13 12:42:56         5152
lion/lion.gdb/a00000003.gdbindexes             2019-05-13 12:42:56           42
lion/lion.gdb/a00000003.gdbtable               2019-05-13 12:44:22         1825
lion/lion.gdb/a00000003.gdbtablx               2019-05-13 12:44:22         5152
lion/lion.gdb/a00000004.CatItemsByPhysicalName.atx 2019-05-13 12:48:56         4118
lion/lion.gdb/a00000004.CatItemsByType.atx     2019-05-13 12:48:56         4118
lion/lion.gdb/a00000004.FDO_UUID.atx           2019-05-13 12:48:56         4118
lion/lion.gdb/a00000004.freelist               2019-05-13 12:49:38        57688
lion/lion.gdb/a00000004.gdbindexes             2019-05-13 12:42:56          310
lion/lion.gdb/a00000004.gdbtable               2019-05-13 12:49:38       399741
lion/lion.gdb/a00000004.gdbtablx               2019-05-13 12:49:38         5152
lion/lion.gdb/a00000004.spx                    2019-05-13 12:49:30        45078
lion/lion.gdb/a00000005.CatItemTypesByName.atx 2019-05-13 12:42:56        12310
lion/lion.gdb/a00000005.CatItemTypesByParentTypeID.atx 2019-05-13 12:42:56         4118
lion/lion.gdb/a00000005.CatItemTypesByUUID.atx 2019-05-13 12:42:56         4118
lion/lion.gdb/a00000005.gdbindexes             2019-05-13 12:42:56          296
lion/lion.gdb/a00000005.gdbtable               2019-05-13 12:42:56         1803
lion/lion.gdb/a00000005.gdbtablx               2019-05-13 12:42:56         5152
lion/lion.gdb/a00000006.CatRelsByDestinationID.atx 2019-05-13 12:48:56         4118
lion/lion.gdb/a00000006.CatRelsByOriginID.atx  2019-05-13 12:48:56         4118
lion/lion.gdb/a00000006.CatRelsByType.atx      2019-05-13 12:48:56         4118
lion/lion.gdb/a00000006.FDO_UUID.atx           2019-05-13 12:48:56         4118
lion/lion.gdb/a00000006.freelist               2019-05-13 12:48:56         4440
lion/lion.gdb/a00000006.gdbindexes             2019-05-13 12:42:56          318
lion/lion.gdb/a00000006.gdbtable               2019-05-13 12:48:56          701
lion/lion.gdb/a00000006.gdbtablx               2019-05-13 12:48:56         5152
lion/lion.gdb/a00000007.CatRelTypesByBackwardLabel.atx 2019-05-13 12:42:56         4118
lion/lion.gdb/a00000007.CatRelTypesByDestItemTypeID.atx 2019-05-13 12:42:56         4118
lion/lion.gdb/a00000007.CatRelTypesByForwardLabel.atx 2019-05-13 12:42:56         4118
lion/lion.gdb/a00000007.CatRelTypesByName.atx  2019-05-13 12:42:56         4118
lion/lion.gdb/a00000007.CatRelTypesByOriginItemTypeID.atx 2019-05-13 12:42:56         4118
lion/lion.gdb/a00000007.CatRelTypesByUUID.atx  2019-05-13 12:42:56         4118
lion/lion.gdb/a00000007.gdbindexes             2019-05-13 12:42:56          602
lion/lion.gdb/a00000007.gdbtable               2019-05-13 12:42:56         2504
lion/lion.gdb/a00000007.gdbtablx               2019-05-13 12:42:56         5152
lion/lion.gdb/a0000000a.gdbindexes             2019-05-13 12:44:48          116
lion/lion.gdb/a0000000a.gdbtable               2019-05-13 12:44:48      3247250
lion/lion.gdb/a0000000a.gdbtablx               2019-05-13 12:44:48       665632
lion/lion.gdb/a0000000a.spx                    2019-05-13 12:44:48      1609750
lion/lion.gdb/a0000000b.gdbindexes             2019-05-13 12:44:56           66
lion/lion.gdb/a0000000b.gdbtable               2019-05-13 12:45:24      5491953
lion/lion.gdb/a0000000b.gdbtablx               2019-05-13 12:45:24      1172512
lion/lion.gdb/a0000000c.gdbindexes             2019-05-13 12:45:30           66
lion/lion.gdb/a0000000c.gdbtable               2019-05-13 12:45:42      6406110
lion/lion.gdb/a0000000c.gdbtablx               2019-05-13 12:45:42       527392
lion/lion.gdb/a0000000d.freelist               2019-05-13 12:48:44        20824
lion/lion.gdb/a0000000d.gdbindexes             2019-05-13 12:48:30          116
lion/lion.gdb/a0000000d.gdbtable               2019-05-13 12:48:44    126137796
lion/lion.gdb/a0000000d.gdbtablx               2019-05-13 12:48:44      1136672
lion/lion.gdb/a0000000d.spx                    2019-05-13 12:48:30      4362262
lion/lion.gdb/gdb                              2019-05-13 12:42:56            4
lion/lion.gdb/timestamps                       2019-05-13 12:47:14          400
lion/LION - Street Name Labels.lyr             2013-09-24 18:10:22        18432
lion/LION Streets - Generic.lyr                2017-02-10 07:51:36        21504
lion/LION Streets - Roadbeds.lyr               2017-02-10 07:54:38        21504
lion/LION - Generic.lyr                        2017-02-10 07:48:08        24576
lion/LION - Roadbeds.lyr                       2017-02-10 07:49:32        24576
lion/LION - Nodes.lyr                          2012-08-17 13:19:50        11776
lion/LION - Street Direction Arrows (ArcGIS 10x).lyr 2013-01-15 10:58:06        22528
lion/LION - Street Direction Arrows (Requires Maplex).lyr 2013-03-12 14:22:44        22528
lion/TrafficRepArrows.style                    2012-09-27 08:32:40       598016
lion/ReadMe.txt                                2014-03-17 15:27:04         7758
altnames_metadata.html                         2019-05-13 12:49:22        38976
altnames_metadata.pdf                          2019-05-13 14:35:58       129541
lion_metadata.html                             2019-05-13 12:49:12       199830
lion_metadata.pdf                              2019-05-13 14:35:22       483768
node_metadata.html                             2019-05-13 12:49:32        42396
node_metadata.pdf                              2019-05-13 14:34:28       135027
node_stname_metadata.html                      2019-05-13 12:49:40        32999
node_stname_metadata.pdf                       2019-05-13 14:36:56       123835
Extracting all the files now...
Done!

Reading the lion layer into a geopandas dataframe

[31]:

fp = "../input_data/nyclion_19b/lion/lion.gdb"
lion_gdf = gpd.read_file(fp, driver='FileGDB', layer='lion')

Removing the non pedestrian accessible roads/paths

[32]:

lion_gdf['todrop'] = (lion_gdf['NonPed'] == 'V') |(lion_gdf['FeatureTyp'].isin(['F','9','1','7','3'])) |(lion_gdf['TrafDir'].isin([' ']))

[33]:

lion_gdf = lion_gdf.loc[lion_gdf['todrop'] == False]

[34]:

lion_gdf.crs

[34]:

{'init': 'epsg:2263'}

[41]:

lion_gdf.plot(figsize = (10,10), markersize = 1);

_images/Network_distance_example_12_0.png

[46]:

lion_gdf.head()

[46]:

	Street	SAFStreetName	FeatureTyp	SegmentTyp	RB_Layer	TrafDir	TrafSrc	SpecAddr	...	RLo_Hyphen	RHi_Hyphen	FromLeft	ToLeft	FromRight	ToRight	Join_ID	SHAPE_Length	geometry	todrop
0	EAST 168 STREET		0	U	B	T	DOT		...	596	716	599	699	596	716	2251001000000	396.030947	(LINESTRING (1010964.446978778 241812.26140345...	False
1	WEST 192 STREET		0	U	B	A	DOT		...	63	99	58	98	63	99	2798401000000	279.360514	(LINESTRING (1011576.686607853 255023.58303095...	False
2	UNION AVENUE		0	U	B	W	DOT		...	1016	1084	1017	1079	1016	1084	2728001000000	618.327133	(LINESTRING (1011600.676209003 239639.74280026...	False
3	UNION AVENUE	BEHAGEN PLAYGROUND	0	U	B	W	DOT	N	...	None	None	0	0	0	0	21279501000000N	618.327133	(LINESTRING (1011600.676209003 239639.74280026...	False
4	DELAFIELD AVENUE		6	U	B	T	DOT		...	4600	4664	4601	4645	4600	4664	2187601000000	670.281037	(LINESTRING (1009974.212831751 264857.13919802...	False

5 rows × 119 columns

[54]:

fp = r"../intermediate_data/nyclion_19b/"
os.mkdir(fp)
lion_gdf[['geometry']].to_file(fp+"nyclion_19b.shp")

Getting the public school location data from NYC Open Data¶

[19]:

url = r"https://data.cityofnewyork.us/download/jfju-ynrr/application%2Fzip"
filename = r"Public_School_Locations.zip"
r = requests.get(url)
# open method to open a file on your system and write the contents
with open("../input_data/"+filename, "wb") as file:
    file.write(r.content)

[23]:

fp = "../input_data/"
foldername = "Public_School_Locations"
with zzip(fp+filename, 'r') as file:
        # printing all the contents of the zip file
        file.printdir()
        path = "../input_data/"+foldername
        os.mkdir(path)
        # extracting all the files
        print('Extracting all the files now...')
        file.extractall(fp+foldername)
        print('Done!')

File Name                                             Modified             Size
Public_Schools_Points_2011-2012A.dbf           2011-08-26 10:30:16       771337
Public_Schools_Points_2011-2012A.prj           2010-10-12 11:55:12          562
Public_Schools_Points_2011-2012A.sbn           2011-08-03 11:23:00        16580
Public_Schools_Points_2011-2012A.sbx           2011-08-03 11:23:00          868
Public_Schools_Points_2011-2012A.shp           2011-08-03 11:23:00        47952
Public_Schools_Points_2011-2012A.shp.xml       2011-08-26 10:32:50        28413
Public_Schools_Points_2011-2012A.shx           2011-08-03 11:23:00        13772
Extracting all the files now...
Done!

Read the school point shapefile into a geopandas geodataframe

[2]:

fp = "../input_data/Public_School_Locations/Public_Schools_Points_2011-2012A.shp"
schools = gpd.read_file(fp)

[3]:

schools.crs

[3]:

{'init': 'epsg:2263'}

[42]:

schools.head()

[42]:

	ATS_CODE	BORO	BORONUM	LOC_CODE	SCHOOLNAME	SCH_TYPE	MANAGED_BY	GEO_DISTRI	ADMIN_DIST	ADDRESS	STATE_CODE	ZIP	PRINCIPAL	PRIN_PH	FAX	GRADES	City	geometry
0	15K001	K	2.0	K001	P.S. 001 THE BERGEN	Elementary	1	15	15	309 47 STREET	NY	11220	Jennifer Eusanio	718-567-7661	718-567-9771	PK,0K,01,02,03,04,05,SE	BROOKLYN	POINT (980985.0541713729 175780.758994163)
1	17K002	K	2.0	K002	M.S. 002	Junior High-Intermediate-Middle	1	17	17	655 PARKSIDE AVENUE	NY	11226	ADRIENNE SPENCER	718-462-6992	718-284-7717	06,07,08,SE	BROOKLYN	POINT (997785.1364296663 178431.910832498)
2	21K095	K	2.0	K095	P.S. 095 THE GRAVESEND	K-8	1	21	21	345 VAN SICKLEN STREET	NY	11223	Janet Ndzibah	718-449-5050	718-449-3047	PK,0K,01,02,03,04,05,06,07,08,SE	BROOKLYN	POINT (991224.9830063033 156323.3612131244)
3	21K096	K	2.0	K096	I.S. 096 SETH LOW	Junior High-Intermediate-Middle	1	21	21	99 AVENUE P	NY	11204	Denise Sandra Levinsky	718-236-1344	718-236-2397	06,07,08,SE	BROOKLYN	POINT (988439.6813609767 160709.9926236183)
4	21K097	K	2.0	K097	P.S. 97 THE HIGHLAWN	Elementary	1	21	21	1855 STILLWELL AVENUE	NY	11223	KRISTINE MUSTILLO	718-372-7393	718-372-3842	PK,0K,01,02,03,04,05,SE	BROOKLYN	POINT (988205.0729096507 158329.5913366843)

[115]:

schools['BORONUM'].value_counts()

[115]:

0    902
0    348
0    341
0     73
0     43
0      2
Name: BORONUM, dtype: int64

[170]:

schools.plot(figsize = (10,10), markersize = 1, color = 'blue');

_images/Network_distance_example_23_0.png

Getting the subway information¶

Read the subway entract shapefile into a geopandas geodataframe

[4]:

fp = r"https://data.cityofnewyork.us/api/geospatial/drex-xx56?method=export&format=Shapefile"
subway = gpd.read_file(fp)

[5]:

subway.crs

[5]:

{'init': 'epsg:4326'}

[6]:

subway = subway.to_crs({'init': 'epsg:2263'})

[7]:

subway.head()

[7]:

	line	name	objectid	url	geometry
0	2-5	Birchall Ave & Sagamore St at NW corner	1734.0	http://web.mta.info/nyct/service/	POINT (1020670.668616476 248679.9905504963)
1	2-5	Birchall Ave & Sagamore St at NE corner	1735.0	http://web.mta.info/nyct/service/	POINT (1020710.253511059 248665.112299423)
2	2-5	Morris Park Ave & 180th St at NW corner	1736.0	http://web.mta.info/nyct/service/	POINT (1019251.986414479 245782.8697043241)
3	2-5	Morris Park Ave & 180th St at NW corner	1737.0	http://web.mta.info/nyct/service/	POINT (1019419.817771734 245866.9102186682)
4	2-5	Boston Rd & 178th St at SW corner	1738.0	http://web.mta.info/nyct/service/	POINT (1017557.721758398 245631.8322767556)

[8]:

subway = subway.reset_index()

[9]:

subway['index'] = subway.index

[168]:

subway.plot(figsize = (10,10), markersize = 1, color = 'red');

_images/Network_distance_example_32_0.png

Adding these points into the street network¶

[55]:

fp = r"../intermediate_data/nyclion_19b/"
lion_graph = nx.read_shp(fp+"nyclion_19b.shp", simplify = True, geom_attrs = True, strict = True)

[56]:

fp = r"../intermediate_data/nyclion_19b/"
nx.write_shp(lion_graph, fp)

[57]:

fp = r"../intermediate_data/nyclion_19b/"
# load as GeoDataFrame
nodes = gpd.read_file(fp+'nodes.shp')
edges = gpd.read_file(fp+'edges.shp')

[63]:

nodes.crs = {'init': 'epsg:2263'}
edges.crs = {'init': 'epsg:2263'}

Create a point geopandas dataframe of these points.

[43]:

points_to_add = gpd.GeoDataFrame(subway[['geometry']].append(schools[['geometry']], sort = False), crs = {'init': 'epsg:2263'})

[58]:

points_to_add = points_to_add.reset_index()

[59]:

points_to_add['key'] = points_to_add.index

This AWESOME function connect_poi() is adapted from Yuwen Chang

This is slightly modified from Yuwen’s version.

[81]:

def connect_poi(pois, nodes, edges, key_col=None, path=None, threshold=600, knn=5):
    """Connect and integrate a set of POIs into an existing road network.
    Given a road network in the form of two GeoDataFrames: nodes and edges,
    link each POI to the nearest edge (road segment) based on its projection
    point (PP) and generate a new integrated road network including the POIs,
    the projected points, and the connection edge.
    Args:
        pois (GeoDataFrame): a gdf of POI (geom: Point)
        nodes (GeoDataFrame): a gdf of road network nodes (geom: Point)
        edges (GeoDataFrame): a gdf of road network edges (geom: LineString)
        key_col (str): a unique key column of pois should be provided,
                       e.g., 'index', 'osmid', 'poi_number', etc.
                       Currently, this will be renamed into 'osmid' in the output.
                       [NOTE] For use in pandana, you may want to ensure this
                              column is numeric-only to avoid processing errors.
                              Preferably use unique integers (int or str) only,
                              and be aware not to intersect with the node key,
                              'osmid' if you use OSM data, in the nodes gdf.
        path (str): directory path to use for saving files (nodes and edges).
                      Outputs will NOT be saved if this arg is not specified.
        threshold (int): the max length of a POI connection edge, POIs with
                         connection edge beyond this length will be removed.
                         The unit is in meters as crs epsg is set to 3857 by
                         default during processing.
        knn (int): k nearest neighbors to query for the nearest edge.
                   Consider increasing this number up to 10 if the connection
                   output is slightly unreasonable. But higher knn number will
                   slow down the process.
    Returns:
        nodes (GeoDataFrame): the original gdf with POIs and PPs appended
        edges (GeoDataFrame): the original gdf with connection edges appended
                              and existing edges updated (if PPs are present)
    Note:
        1. Make sure all three input GeoDataFrames have defined crs attribute.
           Try something like `gdf.crs` or `gdf.crs = {'init': 'epsg:4326'}`.
           They will then be converted into epsg:3857 for processing.
    """

    ## STAGE 0: initialization
    # 0-1: helper functions
    def find_kne(point, lines):
        dists = np.array(list(map(lambda l: l.distance(point), lines)))
        kne_pos = dists.argsort()[0]
        kne = lines.iloc[[kne_pos]]
        kne_idx = kne.index[0]
        return kne_idx, kne.values[0]

    def get_pp(point, line):
        """Get the projected point (pp) of 'point' on 'line'."""
        # project new Point to be interpolated
        pp = line.interpolate(line.project(point))  # PP as a Point
        return pp

    def split_line(line, pps):
        """Split 'line' by all intersecting 'pps' (as multipoint).
        Returns:
            new_lines (list): a list of all line segments after the split
        """
        # IMPORTANT FIX for ensuring intersection between splitters and the line
        # but no need for updating edges_meter manually because the old lines will be
        # replaced anyway
        line = snap(line, pps, 1e-8)  # slow?

        try:
            new_lines = list(split(line, pps))  # split into segments
            return new_lines
        except TypeError as e:
            print('Error when splitting line: {}\n{}\n{}\n'.format(e, line, pps))
            return []

    def update_nodes(nodes, new_points, ptype):
        """Update nodes with a list (pp) or a GeoDataFrame (poi) of new_points.

        Args:
            ptype: type of Point list to append, 'pp' or 'poi'
        """
        # create gdf of new nodes (projected PAPs)
        if ptype == 'pp':
            new_nodes = gpd.GeoDataFrame(new_points, columns=['geometry'],
                                         crs={'init': 'epsg:2263'})
            n = len(new_nodes)
            new_nodes['highway'] = node_highway_pp
            new_nodes['osmid'] = [str(osmid_prefix + i) for i in range(n)]

        # create gdf of new nodes (original POIs)
        elif ptype == 'poi':
            new_nodes = new_points[['geometry', key_col]]
            new_nodes.columns = ['geometry', 'osmid']
            new_nodes['highway'] = node_highway_poi
            new_nodes['osmid'].astype(str, inplace=True)

        else:
            print("Unknown ptype when updating nodes.")

        # merge new nodes (it is safe to ignore the index for nodes)
        gdfs = [nodes, new_nodes]
        nodes = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True, sort=False),
                                 crs=gdfs[0].crs)

        return nodes, new_nodes  # all nodes, newly added nodes only

    def update_edges(edges, new_lines, replace):
        """
        Update edge info by adding new_lines; or,
        replace existing ones with new_lines (n-split segments).
        Args:
            replace: treat new_lines (flat list) as newly added edges if False,
                     else replace existing edges with new_lines (often a nested list)

        Note:
            kne_idx refers to 'fid in Rtree'/'label'/'loc', not positional iloc
        """
        # for interpolation (split by pp): replicate old line
        if replace:
            # create a flattened gdf with all line segs and corresponding kne_idx
            kne_idxs = list(line_pps_dict.keys())
            lens = [len(item) for item in new_lines]
            new_lines_gdf = gpd.GeoDataFrame(
                {'kne_idx': np.repeat(kne_idxs, lens),
                 'geometry': list(itertools.chain.from_iterable(new_lines))})
            # merge to inherit the data of the replaced line
            cols = list(edges.columns)
            cols.remove('geometry')  # don't include the old geometry
            new_edges = new_lines_gdf.merge(edges[cols], how='left',
                                            left_on='kne_idx', right_index=True)
            new_edges.drop('kne_idx', axis=1, inplace=True)
            new_lines = new_edges['geometry']  # now a flatten list
        # for connection (to external poi): append new lines
        else:
            new_edges = gpd.GeoDataFrame(pois[[key_col]], geometry=new_lines,
                                         columns=[key_col, 'geometry'])
            new_edges['oneway'] = False
            new_edges['highway'] = edge_highway

        # update features (a bit slow)
        new_edges['length'] = [l.length for l in new_lines]
        new_edges['from'] = new_edges['geometry'].map(
            lambda x: nodes_id_dict.get(list(x.coords)[0], 'None'))
        new_edges['to'] = new_edges['geometry'].map(
            lambda x: nodes_id_dict.get(list(x.coords)[-1], 'None'))
        new_edges['osmid'] = ['_'.join(s) for s in zip(new_edges['from'],
                                                       new_edges['to'])]

        # remember to reindex to prevent duplication when concat
        start = edges.index[-1] + 1
        stop = start + len(new_edges)
        new_edges.index = range(start, stop)

        # for interpolation: remove existing edges
        if replace:
            edges = edges.drop(kne_idxs, axis=0)
        # for connection: filter invalid links
        else:
            valid_pos = np.where(new_edges['length'] <= threshold)[0]
            n = len(new_edges)
            n_fault = n - len(valid_pos)
            f_pct = n_fault / n * 100
            print("Remove faulty projections: {}/{} ({:.2f}%)".format(n_fault,
                                                                      n,
                                                                      f_pct))
            new_edges = new_edges.iloc[valid_pos]  # use 'iloc' here

        # merge new edges
        dfs = [edges, new_edges]
        edges = gpd.GeoDataFrame(pd.concat(dfs, ignore_index=False, sort=False),
                                 crs=dfs[0].crs)

        # all edges, newly added edges only
        return edges, new_edges

    # 0-2: configurations
    # set poi arguments
    node_highway_pp = 'projected_pap'  # POI Access Point
    node_highway_poi = 'poi'
    edge_highway = 'projected_footway'
    osmid_prefix = 9990000000

    # convert CRS
    pois_meter = pois.to_crs(epsg=2263)
    nodes_meter = nodes.to_crs(epsg=2263)
    edges_meter = edges.to_crs(epsg=2263)

    # build rtree
    print("Building rtree...")
    Rtree = rtree.index.Index()
    [Rtree.insert(fid, geom.bounds) for fid, geom in
     edges_meter['geometry'].iteritems()];

    ## STAGE 1: interpolation
    # 1-1: update external nodes (pois)
    print("Updating external nodes...")
    nodes_meter, _ = update_nodes(nodes_meter, pois_meter, ptype='poi')

    # 1-2: update internal nodes (interpolated pps)
    # locate nearest edge (kne) and projected point (pp)
    print("Projecting POIs to the network...")
    pois_meter['near_idx'] = [list(Rtree.nearest(point.bounds, knn)) for point in
                              pois_meter['geometry']]  # slow
    pois_meter['near_lines'] = [edges_meter['geometry'][near_idx] for near_idx in
                                pois_meter['near_idx']]  # very slow
    pois_meter['kne_idx'], knes = zip(
        *[find_kne(point, near_lines) for point, near_lines in
          zip(pois_meter['geometry'], pois_meter['near_lines'])])  # slow
    pois_meter['pp'] = [get_pp(point, kne) for point, kne in
                        zip(pois_meter['geometry'], knes)]

    # update nodes
    print("Updating internal nodes...")
    nodes_meter, _ = update_nodes(nodes_meter, list(pois_meter['pp']),
                                  ptype='pp')
    nodes_coord = nodes_meter['geometry'].map(lambda x: x.coords[0])
    nodes_id_dict = dict(zip(nodes_coord, nodes_meter['osmid'].astype('str')))

    # 1-3: update internal edges (split line segments)
    print("Updating internal edges...")
    # split
    line_pps_dict = {k: MultiPoint(list(v)) for k, v in
                     pois_meter.groupby(['kne_idx'])['pp']}
    new_lines = [split_line(edges_meter['geometry'][idx], pps) for idx, pps in
                 line_pps_dict.items()]  # bit slow
    edges_meter, _ = update_edges(edges_meter, new_lines, replace=True)

    ## STAGE 2: connection
    # 2-1: update external edges (projected footways connected to pois)
    # establish new_edges
    print("Updating external links...")
    pps_gdf = nodes_meter[nodes_meter['highway'] == node_highway_pp]
    new_lines = [LineString([p1, p2]) for p1, p2 in
                 zip(pois_meter['geometry'], pps_gdf['geometry'])]
    edges_meter, _ = update_edges(edges_meter, new_lines, replace=False)

    ## STAGE 3: output
    # convert CRS
    nodes = nodes_meter.to_crs(epsg=2263)
    edges = edges_meter.to_crs(epsg=2263)

    # preprocess for pandana
    nodes.index = nodes['osmid']  # IMPORTANT
    nodes['x'] = [p.x for p in nodes['geometry']]
    nodes['y'] = [p.y for p in nodes['geometry']]

    # edges.reset_index(drop=True, inplace=True)
    edges['weight'] = edges['geometry'].length
    edges['weight'] = edges['weight'].astype(float)

    # report issues
    # - examine key duplication
    if len(nodes_meter) != len(nodes_id_dict):
        print("NOTE: duplication in node coordinates keys")
        print("Nodes count:", len(nodes_meter))
        print("Node coordinates key count:", len(nodes_id_dict))
    # - examine missing nodes
    print("Missing 'from' nodes:", len(edges[edges['from'] == 'None']))
    print("Missing 'to' nodes:", len(edges[edges['to'] == 'None']))

    # save and return
    if path:
        nodes.to_file(path+'/new_nodes.shp')
        edges[['geometry','weight']].to_file(path+'/new_edges.shp')
        print("new_nodes.shp and new_edges.shp are written into {}".format(path))

    return nodes, edges

[82]:

fp = r"../intermediate_data/nyclion_19b/"
new_nodes, new_edges =  connect_poi(points_to_add, nodes, edges, key_col='key', path=fp)

Building rtree...
Updating external nodes...
Projecting POIs to the network...
Updating internal nodes...
Updating internal edges...
Updating external links...
Remove faulty projections: 2/3637 (0.05%)
NOTE: duplication in node coordinates keys
Nodes count: 110931
Node coordinates key count: 110868
Missing 'from' nodes: 0
Missing 'to' nodes: 0
new_nodes.shp and new_edges.shp are written into ../intermediate_data/nyclion_19b/

Reading in the modified street network¶

[10]:

shp_file = r"../intermediate_data/nyclion_19b/new_edges.shp"
lion_gdf = gpd.read_file(shp_file)

[11]:

shp_file = r"../intermediate_data/nyclion_19b/new_edges.shp"
lion_graph = nx.read_shp(shp_file, simplify = True, geom_attrs = True, strict = True)

Convert to undirected graph¶

And create a nodes dataframe from graph

[12]:

G = lion_graph.to_undirected()
G = nx.convert_node_labels_to_integers(G, first_label = 0, label_attribute='pos')
pos = nx.get_node_attributes(G, 'pos')

[13]:

node_df = pd.DataFrame(pos.items(), columns=['node_id', 'geometry'])
node_df['geometry'] = node_df['geometry'].apply(Point)
node_gdf = gpd.GeoDataFrame(node_df, geometry='geometry')
node_gdf.crs = {'init': 'epsg:2263'}

Locating the nodes for each location¶

We use cDKTree to locate the nodes of the school and subways.

[14]:

# Create the tree
#coord_array = np.array(list(zip(geodataframe.geometry.x, geodataframe.geometry.y)) )
#tree = cKDTree(coord_array)

def ckdnearest2(point_gemo, tree, kneighbors):
    """
    This function read in a coordinate xy position, and a declared cdktree.
    The aim is to find the closest point to the coordinate xy position from the tree
    The function will return the distance between the closest point to the coordinate and the ID of that closest point.
    """
    coordinate_xy = (point_gemo.x,point_gemo.y)
    dist, idx = tree.query(coordinate_xy,k=kneighbors)
    if kneighbors == 1:
        dist = int(dist)
        idx = int(idx)
    return dist, idx

[15]:

coord_array = np.array(list(zip(node_gdf.geometry.x, node_gdf.geometry.y)) )
node_tree = cKDTree(coord_array)

[16]:

subway[['distance_to_node','node_id']] = subway.apply(lambda row: ckdnearest2( row['geometry'], node_tree, 1),
                                                     axis = 1,
                                                     result_type = 'expand')

[17]:

subway['distance_to_node'].value_counts()

[17]:

0    1928
Name: distance_to_node, dtype: int64

[18]:

subway.drop('distance_to_node', axis = 1, inplace = True)

[19]:

schools[['distance_to_node','node_id']] = schools.apply(lambda row: ckdnearest2( row['geometry'], node_tree, 1),
                                                        axis = 1,
                                                        result_type = 'expand')

[20]:

schools['distance_to_node'].value_counts()

[20]:

      1707
     1
     1
Name: distance_to_node, dtype: int64

There are two schools that are really far when they should be at a distance of zero from the nodes. Recall when I ran the connect_poi() function that it Remove faulty projections: 2/3637 (0.05%).

[22]:

fig, ax = plt.subplots(figsize=(10, 10))

#new_edges.plot(ax=ax)
schools.loc[schools['distance_to_node']>0].plot(ax=ax, color = 'red')

[22]:

<matplotlib.axes._subplots.AxesSubplot at 0x14a6824de48>

_images/Network_distance_example_61_1.png

Remove schools that are not in NYC

[23]:

schools = schools[schools['distance_to_node'] == 0]

[24]:

schools.drop('distance_to_node', axis = 1, inplace = True)

Finding the nearest 2 subway entrances¶

Now we want to find the closest 2 subway entrances to each school

Set up the tree for subway location

[25]:

coord_array = np.array(list(zip(subway.geometry.x, subway.geometry.y)) )
subway_tree = cKDTree(coord_array)

[26]:

schools[['distance_to_subway_nodes','subway_ids']] = schools.apply(lambda row: ckdnearest2( row['geometry'], subway_tree, 2),
                                                        axis = 1,
                                                        result_type = 'expand')

Parse out the subway_node_ids

[27]:

schools['closest_subway_id1'], schools['closest_subway_id2'] = zip(*schools['subway_ids'])

[28]:

schools.head()

[28]:

	ATS_CODE	BORO	BORONUM	LOC_CODE	SCHOOLNAME	SCH_TYPE	MANAGED_BY	GEO_DISTRI	ADMIN_DIST	ADDRESS	...	PRIN_PH	FAX	GRADES	City	geometry	node_id	distance_to_subway_nodes	subway_ids	closest_subway_id1	closest_subway_id2
0	15K001	K	2.0	K001	P.S. 001 THE BERGEN	Elementary	1	15	15	309 47 STREET	...	718-567-7661	718-567-9771	PK,0K,01,02,03,04,05,SE	BROOKLYN	POINT (980985.0541713729 175780.758994163)	109165	[574.0166423762518, 617.2474703341181]	[1744, 1746]	1744	1746
1	17K002	K	2.0	K002	M.S. 002	Junior High-Intermediate-Middle	1	17	17	655 PARKSIDE AVENUE	...	718-462-6992	718-284-7717	06,07,08,SE	BROOKLYN	POINT (997785.1364296663 178431.910832498)	109166	[258.5002370023575, 442.8355620846949]	[1349, 749]	1349	749
2	21K095	K	2.0	K095	P.S. 095 THE GRAVESEND	K-8	1	21	21	345 VAN SICKLEN STREET	...	718-449-5050	718-449-3047	PK,0K,01,02,03,04,05,06,07,08,SE	BROOKLYN	POINT (991224.9830063033 156323.3612131244)	109167	[431.5770408752747, 492.43651609913985]	[1522, 1591]	1522	1591
3	21K096	K	2.0	K096	I.S. 096 SETH LOW	Junior High-Intermediate-Middle	1	21	21	99 AVENUE P	...	718-236-1344	718-236-2397	06,07,08,SE	BROOKLYN	POINT (988439.6813609767 160709.9926236183)	109168	[1366.7287949334989, 1549.5096241139383]	[443, 203]	443	203
4	21K097	K	2.0	K097	P.S. 97 THE HIGHLAWN	Elementary	1	21	21	1855 STILLWELL AVENUE	...	718-372-7393	718-372-3842	PK,0K,01,02,03,04,05,SE	BROOKLYN	POINT (988205.0729096507 158329.5913366843)	109169	[1224.1544066200531, 1270.3197366749077]	[190, 189]	190	189

5 rows × 23 columns

[29]:

subway.head()

[29]:

	index	line	name	objectid	url	geometry	node_id
0	0	2-5	Birchall Ave & Sagamore St at NW corner	1734.0	http://web.mta.info/nyct/service/	POINT (1020670.668616476 248679.9905504963)	107248
1	1	2-5	Birchall Ave & Sagamore St at NE corner	1735.0	http://web.mta.info/nyct/service/	POINT (1020710.253511059 248665.112299423)	107249
2	2	2-5	Morris Park Ave & 180th St at NW corner	1736.0	http://web.mta.info/nyct/service/	POINT (1019251.986414479 245782.8697043241)	107250
3	3	2-5	Morris Park Ave & 180th St at NW corner	1737.0	http://web.mta.info/nyct/service/	POINT (1019419.817771734 245866.9102186682)	107251
4	4	2-5	Boston Rd & 178th St at SW corner	1738.0	http://web.mta.info/nyct/service/	POINT (1017557.721758398 245631.8322767556)	107252

Merging in the respective node_ids for the nearest subway locations

[30]:

schools = schools.merge(subway[['index','node_id']],
                        how = 'left',
                        left_on = 'closest_subway_id1',
                        right_on = 'index')

[31]:

schools.drop('index', axis = 1, inplace = True)
schools.rename(columns={"node_id_y": "subway_node_id1"},inplace =True)

[32]:

schools = schools.merge(subway[['index','node_id']],
                        how = 'left',
                        left_on = 'closest_subway_id2',
                        right_on = 'index')

[33]:

schools.drop('index', axis = 1, inplace = True)
schools.rename(columns={"node_id": "subway_node_id2"},inplace =True)

[34]:

schools.rename(columns={"node_id_x": "node_id"},inplace =True)

[35]:

def network_distance(graph, node_gdf, origin_node, target_node, weight_name):
    """"
    graph is the networkx graph
    node_gdf is the geodataframe of nodes. This will allow us to create the LineString.
    origin_node is the starting node to begin the network distance
    target_node is the end node to end the network distance
    weight is the distance measure

    We use the shortest_path function from networkx.
    Use the nodes on the shortest path and extract the coordinates from the node_gdf
    Use the geometry from the route_nodes to create the line string
    From the line string we will extract the length
    """

    try:
        route = nx.shortest_path(G=graph, source=origin_node, target=target_node, weight = weight_name)
        route_nodes = node_gdf.loc[route]
        route_line = LineString(list(route_nodes.geometry.values))
    except:
        route_line = LineString()

    route_line_dist = int(route_line.length)
    return route_line, route_line_dist

Network distance¶

[38]:

t0 = time.time()
schools[['route1_geom', 'route1_dist']] = schools.apply(
        lambda row: network_distance(G, node_gdf, row['node_id'], target_node = row['subway_node_id1'],weight_name = 'weight'),
        axis = 1,
        result_type = 'expand')

t1 = time.time()
time_difference = (((t1-t0)/60)/60)
print("It took {0} hours".format(time_difference))

It took 0.002848712139659458 hours

[39]:

t0 = time.time()
schools[['route2_geom', 'route2_dist']] = schools.apply(
        lambda row: network_distance(G, node_gdf, row['node_id'], target_node = row['subway_node_id2'],weight_name = 'weight'),
        axis = 1,
        result_type = 'expand')

t1 = time.time()
time_difference = (((t1-t0)/60)/60)
print("It took {0} hours".format(time_difference))

It took 0.0029577416181564334 hours

Determine which route is shorter between the two

[40]:

schools.loc[(schools['route1_dist'] <= schools['route2_dist']), 'shortest_network_distance'] = schools['route1_dist']
schools.loc[(schools['route1_dist'] > schools['route2_dist']) , 'shortest_network_distance'] = schools['route2_dist']

[41]:

schools.loc[(schools['route1_dist'] <= schools['route2_dist']), 'shortest_network_geom'] = schools['route1_geom']
schools.loc[(schools['route1_dist'] > schools['route2_dist']), 'shortest_network_geom'] = schools['route2_geom']

[43]:

schools['route1_dist'].isnull().value_counts()

[43]:

False    1707
Name: route1_dist, dtype: int64

[60]:

schools.columns

[60]:

Index(['ATS_CODE', 'BORO', 'BORONUM', 'LOC_CODE', 'SCHOOLNAME', 'SCH_TYPE',
       'MANAGED_BY', 'GEO_DISTRI', 'ADMIN_DIST', 'ADDRESS', 'STATE_CODE',
       'ZIP', 'PRINCIPAL', 'PRIN_PH', 'FAX', 'GRADES', 'City', 'geometry',
       'node_id', 'distance_to_subway_nodes', 'subway_ids',
       'closest_subway_id1', 'closest_subway_id2', 'subway_node_id1',
       'subway_node_id2', 'route1_geom', 'route1_dist', 'route2_geom',
       'route2_dist', 'shortest_network_distance', 'shortest_network_geom'],
      dtype='object')

[65]:

schools[['node_id','subway_node_id1','subway_node_id2','route1_geom','route1_dist','route2_geom','route2_dist','shortest_network_distance','shortest_network_geom']].loc[schools['shortest_network_distance'] == 0]

[65]:

	node_id	subway_node_id1	subway_node_id2	route1_geom	route1_dist	route2_geom	shortest_network_geom
993	110157	108875	107599	GEOMETRYCOLLECTION EMPTY	0	GEOMETRYCOLLECTION EMPTY	GEOMETRYCOLLECTION EMPTY
1018	110182	108875	107599	GEOMETRYCOLLECTION EMPTY	0	GEOMETRYCOLLECTION EMPTY	GEOMETRYCOLLECTION EMPTY
1042	110206	108871	107599	LINESTRING (929431.2955717515 137631.694279956...	4601	GEOMETRYCOLLECTION EMPTY	GEOMETRYCOLLECTION EMPTY
1613	110772	108750	107347	GEOMETRYCOLLECTION EMPTY	0	GEOMETRYCOLLECTION EMPTY	GEOMETRYCOLLECTION EMPTY

[61]:

fp = r"../intermediate_data/school_subway_route.shp"
schools = schools.set_geometry('shortest_network_geom')
schools[['node_id','shortest_network_geom']].to_file(fp)

[62]:

fp = r"../intermediate_data/school.shp"
schools = schools.set_geometry('geometry')
schools[['node_id','geometry']].to_file(fp)

[48]:

fp = r"../intermediate_data/subway.shp"
subway = subway.set_geometry('geometry')
subway['geometry'].to_file(fp)

Plot the schools, subway and routes¶

[44]:

fig, ax = plt.subplots(figsize=(15, 15))

schools.plot(ax=ax, color = 'blue', markersize = 9, label = 'school')
subway.plot(ax=ax, color = 'red', markersize = 9, label = 'subway')
schools = schools.set_geometry('shortest_network_geom')
schools.plot(ax=ax, color = 'black', linewidth=1, label = 'school routes')
#Set default back
schools = schools.set_geometry('geometry')

plt.title("Routes from NYC schools to nearest subway entrance")
plt.legend(loc='upper left')

ax.set_axis_off()
plt.show();

_images/Network_distance_example_94_0.png

I can see that there are 3 blue dots in Staten Island that should have a route drawn, but the networkx shortest path function did not find a path for these origin nodes. Need to figure out why. The other blue dot south of Manhattan is on Governors island.