29
loading...
This website collects cookies to deliver better user experience
scripts directory of the COVID-19 geosearch demo repo. Feel free to clone it and test the finished demo. You'll need to create a .env file from the example.env and add your Algolia app information and API key before you run the scripts.
39049,Franklin,Ohio,US,2021-08-31 04:21:46,39.96995815,-83.01115755,139178,1531,,,"Franklin, Ohio, US",10569.763874248532,1.0863785943180675
#!python3
import pandas
DATA_FILE = '../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/08-30-2021.csv'


def main():
    """Read the daily report CSV at DATA_FILE and print the resulting DataFrame."""
    print(pandas.read_csv(DATA_FILE))


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()
#!python3
from pymongo import MongoClient
# NOTE(review): the host part of this URL reads "[email protected]", which looks
# like an e-mail-obfuscation placeholder rather than a real cluster address --
# restore the real connection string before running.
MDB_URL = "mongodb+srv://readonly:[email protected]/covid19"


def main():
    """Query the stats recorded for the most recently loaded date.

    Connects to the "covid19" database at MDB_URL, reads "last_date" from the
    first document of the "metadata" collection, runs a find() on the
    "global_and_us" collection filtered on that date, and prints the result.
    """
    client = MongoClient(MDB_URL)
    db = client.get_database("covid19")
    stats = db.get_collection("global_and_us")
    metadata = db.get_collection("metadata")

    # Get the last date loaded:
    meta = metadata.find_one()
    last_date = meta["last_date"]

    # The call was missing its closing parenthesis, which made the whole
    # script a syntax error.
    results = stats.find(
        {
            "date": last_date,
        }
    )
    # find() returns a cursor; iterate over it to see the matching documents.
    print(results)


if __name__ == "__main__":
    main()
# Same date filter, restricted to documents whose "loc" field exists and is
# not an empty array. The second argument is a projection that limits the
# returned fields to the ones listed.
results = stats.find(
    {
        "date": last_date,
        "loc": {"$exists": True, "$ne": []},
    },
    {
        "combined_name": 1,
        "county": 1,
        "state": 1,
        "country": 1,
        "confirmed": 1,
        "loc": 1,
    },
)  # closing parenthesis was missing in the original snippet
#!python3
import json
import requests
METADATA_URL = 'https://webhooks.mongodb-stitch.com/api/client/v2.0/app/covid-19-qppza/service/REST-API/incoming_webhook/metadata'
REST_URL = 'https://webhooks.mongodb-stitch.com/api/client/v2.0/app/covid-19-qppza/service/REST-API/incoming_webhook/global_and_us'


def main():
    """Fetch and print the REST API records for the last loaded date.

    Reads "last_date" from the metadata endpoint, then requests the
    global_and_us endpoint with min_date and max_date both set to that date
    and a hide_fields list, and prints the decoded JSON response.

    Raises:
        requests.HTTPError: if either endpoint answers with a 4xx/5xx status.
    """
    meta = requests.get(METADATA_URL)
    # Fail with the HTTP status instead of a confusing JSON/KeyError later.
    meta.raise_for_status()
    last_date = meta.json()['last_date']

    query = {
        'min_date': last_date,
        'max_date': last_date,
        'hide_fields': '_id, fips, country_code, country_iso2, country_iso3, population, deaths, confirmed_daily, deaths_daily, recovered, recovered_daily',
    }
    response = requests.get(REST_URL, params=query)
    response.raise_for_status()
    print(response.json())


if __name__ == "__main__":
    main()
hide_fields parameter we can use as a negative filter to get just the fields we need. It’s not as straightforward as just listing the fields we want, but there might have been reasons for designing the API like that. With APIs, we must work with the interface we’re given. Fortunately, as we’ll see in the next section, this is a problem developers have been thinking about.
#!python3
import json
import requests
GRAPHQL_AUTH = "https://realm.mongodb.com/api/client/v2.0/app/covid-19-qppza/auth/providers/anon-user/login"
GRAPHQL_URL = "https://realm.mongodb.com/api/client/v2.0/app/covid-19-qppza/graphql"


def main():
    """Fetch and print the GraphQL records for the last loaded date.

    Obtains an access token from GRAPHQL_AUTH, queries GRAPHQL_URL for the
    metadatum "last_date", then queries global_and_us for that date (limit
    5000) and prints the decoded JSON response.

    Raises:
        requests.HTTPError: if the login request answers with a 4xx/5xx status.
        Exception: if either GraphQL request does not return HTTP 200.
    """
    response = requests.get(GRAPHQL_AUTH)
    # Surface a failed login as an HTTP error rather than a missing-key error.
    response.raise_for_status()
    access_token = response.json()['access_token']

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}",
    }

    metadata = requests.post(GRAPHQL_URL, headers=headers, json={'query': 'query { metadatum{ last_date }}'})
    if metadata.status_code != 200:
        # Report the status of the metadata request itself; the original
        # message mistakenly used the earlier login response's status.
        raise Exception(f"Query failed to run with a {metadata.status_code}.")
    last_date = metadata.json()['data']['metadatum']['last_date']

    # Adjacent literals are joined at compile time, so this yields exactly the
    # same query text as the original triple-quoted concatenation.
    query = (
        'query {\n'
        'global_and_us(query: { date: "' + last_date + '" }, limit:5000)\n'
        '{ confirmed county state country combined_name loc { type coordinates }}\n'
        '}'
    )
    response = requests.post(GRAPHQL_URL, headers=headers, json={'query': query})
    if response.status_code != 200:
        raise Exception(f"Query failed to run with a {response.status_code}.")
    print(response.json())


if __name__ == "__main__":
    main()
objectID per record. It makes sense to map these IDs to an existing ID in our data source. The Algolia API can create these IDs for us, but we don't recommend this. Beyond these requirements, the goal here is to trim our records down to the simplest form that still provides the right balance between performant search and useful results.
pandas
# Build one search record per CSV row that has coordinates.
covid_records = []
for index, row in df.iterrows():
    # Skip locations w/o coordinates
    if pandas.isna(row['Lat']):
        print('Skipping {}: No geocode'.format(row['Combined_Key']))
        continue

    print(row['Combined_Key'])

    # Let's not use the combined key for US counties, instead let's use county and state
    if pandas.isna(row['Admin2']):
        location = row['Combined_Key']
    else:
        location = row['Admin2'] + ', ' + row['Province_State']

    covid_record = {
        # Algolia's record identifier attribute is spelled "objectID"
        # (case-sensitive); the original key had stray backticks and a
        # lowercase "d", so it would not have been treated as the ID.
        'objectID': row['Combined_Key'],
        'location': location,
        'country': row['Country_Region'],
        'confirmed_cases': int(row['Confirmed']),
        '_geoloc': {'lat': row['Lat'], 'lng': row['Long_']},
    }
    covid_records.append(covid_record)
# Build one search record per API row that has a "loc" entry.
covid_records = []
for row in response.json():
    # Unassigned and Unknown records are already scrubbed in this DB
    # Skip 'US' and 'Canada' since they have incomplete data
    # and locations w/o coordinates
    if row['combined_name'] in ('US', 'Canada') or 'loc' not in row:
        print('Skipping {}: No geocode'.format(row['combined_name']))
        continue

    print(row['combined_name'])

    # Let's not use the combined key for US counties, instead let's use county and state
    if 'county' in row:
        location = row['county'] + ', ' + row['state']
    else:
        location = row['combined_name']

    covid_record = {
        # Algolia's record identifier attribute is spelled "objectID"
        # (case-sensitive); the original key had stray backticks and a
        # lowercase "d", so it would not have been treated as the ID.
        'objectID': row['combined_name'],
        'location': location,
        'country': row['country'],
        'confirmed_cases': row['confirmed'],
        # "coordinates" is read as [lng, lat]: index 1 is latitude, index 0 longitude.
        '_geoloc': {
            'lat': row['loc']['coordinates'][1],
            'lng': row['loc']['coordinates'][0],
        },
    }
    covid_records.append(covid_record)
# Create the Algolia client from the APP_ID / API_KEY environment variables
# and open the index named by ALGOLIA_INDEX_NAME.
app_id = os.getenv('APP_ID')
api_key = os.getenv('API_KEY')
client = SearchClient.create(app_id, api_key)
index = client.init_index(os.getenv('ALGOLIA_INDEX_NAME'))

# Empty the index first, then upload the freshly built records.
index.clear_objects()
index.save_objects(covid_records)
country and location, and rank our results in descending order by the number of cases. High case counts tend to map to large population centers, which makes the data more useful when we zoom out on the map.
{
...
"searchableAttributes": ["unordered(country)", "unordered(location)"],
"ranking": ["typo", "geo", "words", "filters", "proximity", "attribute", "exact", "custom"],
"customRanking": ["desc(confirmed_cases)"],
...
}
objectID becomes critical to reference the existing records during updates.