Recently, while updating business information in my area, I noticed that outdated POIs would often include a defunct website. When attempting to follow the links in OSM, I was just presented with an error. That gave me the idea that I could proactively identify POIs that may need to be validated in person.
After some tinkering, I came up with a python script that will query OSM data for nodes with websites, then iterate over the websites to see which return errors when connecting. I started with logging these to a text file, but then realized that I could export them all to a kmz file and import them into Organic Maps. Now when I’m out and about I can launch Organic Maps to find areas that I should validate.
When you select a pin, it’ll give you more details on the failure.
While testing I noticed that there are also many sites getting errors indicating that the specific page isn’t found or that the page can be found at a new location. I believe that these can be validated without surveying, so I set up a separate mode called “armchair” mode that highlights these errors.
Setup
NOTE: This will initiate connections from your machine to whatever websites exist in OSM
To use the script, copy the below file and save it as “broken-websites.py”. Before running I believe you’ll need to install the required packages by running the below commands in your terminal:
pip3 install simplekml
pip3 install overpy
pip3 install geopy
pip3 install requests
From there you can change the latitude, longitude, and radius. Keep in mind that a larger radius takes considerably longer, because the search area grows with the square of the radius. You can run commands like:
# List all websites that return a 5XX error to a text file
python3 ./broken-websites.py survey txt
# Generate a kmz of all websites that redirect or return 404
python3 ./broken-websites.py armchair kmz
# Generate both list types to both file types
python3 ./broken-websites.py both both
Script:
import datetime
import html
import math
import sys

import overpy
import requests
import simplekml
from geopy.distance import geodesic
# Search centre in decimal degrees; edit these to target a different area.
latitude = 52.377956
longitude = 4.897070
# Search radius around the centre, in kilometers.
radius = 0.5
def check_website(url):
    """Probe *url* with an HTTP HEAD request and report what happened.

    Returns:
        ``(status_code, reason)`` from the response when the request
        completes, or ``(None, <exception text>)`` when anything raises.

    ``requests.head`` does not follow redirects by default, so 3xx status
    codes are returned to the caller as-is.
    """
    try:
        reply = requests.head(url, timeout=10)
        outcome = (reply.status_code, reply.reason)
    except Exception as exc:
        # Intentionally broad: any failure (DNS, TLS, malformed URL, ...) is
        # reported as a result for this URL instead of aborting the whole run.
        outcome = (None, str(exc))
    return outcome
def get_amenities():
    """Fetch OSM nodes tagged with ``website`` within ``radius`` km.

    Queries the Overpass API for a bounding box around the module-level
    ``latitude``/``longitude`` centre, then keeps only the nodes whose
    geodesic distance from that centre is at most ``radius`` kilometers.

    Returns:
        A list of dicts with the keys ``name``, ``amenity_type``,
        ``latitude``, ``longitude``, ``website``, ``distance`` (km) and
        ``osm_link``.
    """
    api = overpy.Overpass()

    # About 0.009 degrees of latitude correspond to one kilometer.
    lat_delta = radius * 0.009
    # A degree of longitude covers less ground away from the equator (it
    # shrinks by cos(latitude)), so the longitude offset has to be widened
    # accordingly; otherwise the box is too narrow east/west and nodes that
    # are inside the radius are never fetched. The floor on the cosine avoids
    # a division by zero at the poles.
    cos_lat = max(math.cos(math.radians(latitude)), 1e-6)
    lon_delta = min(lat_delta / cos_lat, 180.0)

    # Keep the box within valid coordinate ranges. Boxes that would cross the
    # antimeridian are simply cut off at +/-180 degrees.
    lat_min = max(latitude - lat_delta, -90.0)
    lat_max = min(latitude + lat_delta, 90.0)
    lon_min = max(longitude - lon_delta, -180.0)
    lon_max = min(longitude + lon_delta, 180.0)

    query_str = f"""
    node["website"]
      ({lat_min},{lon_min},{lat_max},{lon_max});
    out;
    """
    result = api.query(query_str)

    amenities = []
    for node in result.nodes:
        lat = float(node.lat)
        lon = float(node.lon)
        distance = geodesic((latitude, longitude), (lat, lon)).kilometers
        # The bounding box is only a coarse pre-filter; its corners lie
        # outside the circle, so enforce the real radius here.
        if distance > radius:
            continue
        amenities.append({
            "name": node.tags.get("name", "Unknown"),
            "amenity_type": node.tags.get("amenity", "Unknown"),
            "latitude": lat,
            "longitude": lon,
            "website": node.tags.get("website", "N/A"),
            "distance": distance,
            "osm_link": f"https://www.openstreetmap.org/node/{node.id}",
        })
    return amenities
def save_to_kmz(amenities_with_broken_websites, mode, timestamp):
    """Write one KMZ placemark per amenity and save the file.

    Args:
        amenities_with_broken_websites: Iterable of amenity dicts providing
            ``name``, ``website``, ``error_message``, ``osm_link``,
            ``latitude`` and ``longitude``.
        mode: Label embedded in the output file name.
        timestamp: Pre-formatted timestamp embedded in the output file name.

    The file is written to the current working directory and its name is
    printed once it has been saved.
    """
    kml = simplekml.Kml()
    icon_url = 'https://upload.wikimedia.org/wikipedia/commons/e/ec/Red_dot.svg'  # Privacy-friendly icon URL
    for amenity in amenities_with_broken_websites:
        # These values come from OSM tags and from exception text, either of
        # which can contain characters such as <, >, & or quotes. Escape them
        # so they cannot break the href attributes or be read as HTML tags
        # when the description is rendered.
        website = html.escape(amenity['website'], quote=True)
        error_message = html.escape(amenity['error_message'])
        osm_link = html.escape(amenity['osm_link'], quote=True)
        description = (f"<p>Website: <a href='{website}' target='_blank'>{website}</a></p>"
                       f"<p>{error_message}</p>"
                       f"<p><a href='{osm_link}' target='_blank'>Link to OSM</a></p>")
        # KML coordinates are given as (longitude, latitude).
        placemark = kml.newpoint(name=amenity['name'],
                                 description=description,
                                 coords=[(amenity['longitude'], amenity['latitude'])])
        placemark.style.iconstyle.icon.href = icon_url
    file_name = (f"Broken_Websites_{mode}_Radius_{radius}km_Lat_{latitude}_Long_{longitude}_{timestamp}.kmz")
    kml.savekmz(file_name)
    print(f"KMZ file saved as: {file_name}")
def save_to_txt(amenities_with_broken_websites, mode, timestamp):
    """Write a plain-text report with one paragraph per amenity.

    Args:
        amenities_with_broken_websites: Iterable of amenity dicts providing
            ``name``, ``amenity_type``, ``latitude``, ``longitude``,
            ``website``, ``distance``, ``error_message`` and ``osm_link``.
        mode: Label embedded in the output file name.
        timestamp: Pre-formatted timestamp embedded in the output file name.

    The file is written to the current working directory and its name is
    printed once it has been saved.
    """
    file_name = (f"Broken_Websites_{mode}_Radius_{radius}km_Lat_{latitude}_Long_{longitude}_{timestamp}.txt")
    # POI names and URLs are frequently non-ASCII; pin the encoding so the
    # write does not depend on (or fail under) the platform default.
    with open(file_name, 'w', encoding='utf-8') as f:
        for amenity in amenities_with_broken_websites:
            f.write(
                f"Name: {amenity['name']}\n"
                f"Amenity Type: {amenity['amenity_type']}\n"
                f"Latitude: {amenity['latitude']}\n"
                f"Longitude: {amenity['longitude']}\n"
                f"Website: {amenity['website']}\n"
                f"Distance: {amenity['distance']:.2f} km\n"
                f"Error: {amenity['error_message']}\n"
                f"OpenStreetMap: {amenity['osm_link']}\n"
                "\n"
            )
    print(f"Text file saved as: {file_name}")
def filter_amenities(amenities):
    """Check every amenity's website and sort the failures into two lists.

    * ``"survey"``: the request failed outright (no status code) or the
      server answered with a 5xx status.
    * ``"armchair"``: the server answered with a 3xx status or with 404.

    Amenities with any other status are left out. Each reported amenity dict
    is given an ``error_message`` entry in place.

    Returns:
        A dict ``{"survey": [...], "armchair": [...]}``.
    """
    survey, armchair = [], []
    for poi in amenities:
        code, detail = check_website(poi['website'])
        if not code or 500 <= code < 600:
            bucket = survey
        elif 300 <= code < 400 or code == 404:
            # code is known to be truthy here, so no extra check is needed.
            bucket = armchair
        else:
            continue
        poi['error_message'] = f"Status Code: {code}, Error: {detail}"
        bucket.append(poi)
    return {"survey": survey, "armchair": armchair}
def main():
    """Command-line entry point: ``<script> <mode> <output>``.

    ``mode`` is one of ``survey``, ``armchair`` or ``both``; ``output`` is one
    of ``txt``, ``kmz`` or ``both``. On a wrong argument count or an invalid
    value a message is printed and the function returns without doing any
    work. Otherwise the amenities are fetched, their websites checked, and
    each requested category is written in each requested format.
    """
    if len(sys.argv) != 3:
        print("Usage: python script.py <mode> <output>")
        print("Mode should be one of: survey, armchair, both")
        print("Output should be one of: txt, kmz, both")
        return

    mode, output = sys.argv[1], sys.argv[2]
    if mode not in ("survey", "armchair", "both"):
        print("Invalid mode. Mode should be one of: survey, armchair, both")
        return
    if output not in ("txt", "kmz", "both"):
        print("Invalid output. Output should be one of: txt, kmz, both")
        return

    print(f"Searching for amenities with websites within {radius} km...")
    amenities = get_amenities()
    if not amenities:
        print("No amenities found within the specified radius or with specified website.")
        return

    print("\nChecking websites...")
    filtered_amenities = filter_amenities(amenities)
    # One timestamp is shared by every file produced in this run.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    want_kmz = output in ("kmz", "both")
    want_txt = output in ("txt", "both")
    for category in ("survey", "armchair"):
        if mode not in (category, "both"):
            continue
        broken = filtered_amenities[category]
        if not broken:
            print(f"No amenities found with broken websites in {category} mode.")
            continue
        if want_kmz:
            save_to_kmz(broken, category, timestamp)
        if want_txt:
            save_to_txt(broken, category, timestamp)


if __name__ == "__main__":
    main()