23
loading...
This website collects cookies to deliver better user experience
FROM postgres:13.3
COPY pg.conf /etc/postgresql/postgresql.conf
COPY init.sql /docker-entrypoint-initdb.d/
# importing data using 'copy' command
COPY ./data/* /docker-entrypoint-initdb.d/
CMD ["-c", "config_file=/etc/postgresql/postgresql.conf"]
postgres:13.3
image from the official Postgres repository, and we are providing our own Postgres config, which is almost identical to the default one, with only a few adjustments in ./containers/postgres/pg.conf
:statement_timeout = 1200000 # in milliseconds (20 minutes) to allow COPY command to finish
max_wal_size = 3GB
./containers/postgres/data
folder under the following names:./containers/postgres/data/year_2014.csv
./containers/postgres/data/year_2015.csv
./containers/postgres/data/year_2016.csv
./containers/postgres/data/year_2017.csv
init.sql
file in the postgres
folder. There's an SQL definition of the table nyc_tickets
that will contain all our data. I will create a few indices on the table, to speed up the future queries:CREATE INDEX "IDX_tickets_vehicle_make" ON "nyc_tickets" ("vehicle_make");
CREATE INDEX "IDX_tickets_issue_date" ON "nyc_tickets" ("issue_date");
summons_number
as the primary key. However, in the input CSV files, there is a number of collisions and duplicates (i.e. multiple rows with identical summons_number
values).INSERT ... ON CONFLICT DO NOTHING;
CREATE RULE ... AS ON INSERT TO ... WHERE EXISTS ... DO INSTEAD NOTHING;
COPY
command won't allow that syntax. So instead, we'll apply the following trick:temp_tickets
that is identical to the original but doesn't contain any primary keys or indices.COPY
the data to that temporary table.nyc_tickets
using ON CONFLICT DO NOTHING
rule.COPY "temp_tickets" ("summons_number", "plate_id", "registration_state", "plate_type", "issue_date", "violation_code",
"vehicle_body_type", "vehicle_make", "issuing_agency", "street_code_1", "street_code_2",
"street_code_3", "vehicle_expiration_date", "violation_location", "violation_precinct",
"issuer_precinct", "issuer_code", "issuer_command", "issuer_squad", "violation_time",
"time_first_observed", "violation_county", "violation_in_front_of_or_opposite", "house_number",
"street_name", "intersecting_street", "date_first_observed", "law_section", "sub_division",
"violation_legal_code", "days_parking_in_effect", "from_hours_in_effect", "to_hours_in_effect",
"vehicle_color", "unregistered_vehicle", "vehicle_year", "meter_number", "feet_from_curb",
"violation_post_code", "violation_description", "no_standing_or_stopping_violation",
"hydrant_violation", "double_parking_violation")
FROM '/docker-entrypoint-initdb.d/year_2017.csv' DELIMITER ',' CSV HEADER;
init.sql
looks like:-- creating Metabase DB first
CREATE DATABASE "metabase";
GRANT ALL PRIVILEGES ON DATABASE "metabase" TO "analyst";
BEGIN;
GRANT ALL PRIVILEGES ON DATABASE "tickets" TO "analyst";
CREATE TABLE "nyc_tickets"
(
"summons_number" BIGINT NOT NULL,
"plate_id" TEXT,
"registration_state" TEXT,
"plate_type" TEXT,
"issue_date" DATE,
"violation_code" INTEGER,
"vehicle_body_type" TEXT,
"vehicle_make" TEXT,
"issuing_agency" TEXT,
"street_code_1" INTEGER,
"street_code_2" INTEGER,
"street_code_3" INTEGER,
"vehicle_expiration_date" TEXT,
"violation_location" TEXT,
"violation_precinct" INTEGER,
"issuer_precinct" INTEGER,
"issuer_code" INTEGER,
"issuer_command" TEXT,
"issuer_squad" TEXT,
"violation_time" TEXT,
"time_first_observed" TEXT,
"violation_county" TEXT,
"violation_in_front_of_or_opposite" TEXT,
"house_number" TEXT,
"street_name" TEXT,
"intersecting_street" TEXT,
"date_first_observed" TEXT,
"law_section" INTEGER,
"sub_division" TEXT,
"violation_legal_code" TEXT,
"days_parking_in_effect" TEXT,
"from_hours_in_effect" TEXT,
"to_hours_in_effect" TEXT,
"vehicle_color" TEXT,
"unregistered_vehicle" TEXT,
"vehicle_year" INTEGER,
"meter_number" TEXT,
"feet_from_curb" DECIMAL,
"violation_post_code" TEXT,
"violation_description" TEXT,
"no_standing_or_stopping_violation" TEXT,
"hydrant_violation" TEXT,
"double_parking_violation" TEXT,
"latitude" DECIMAL,
"longitude" DECIMAL,
"community_board" TEXT,
"community_council" TEXT,
"census_tract" TEXT,
"bin" TEXT,
"bbl" TEXT,
"nta" TEXT,
CONSTRAINT "PK_tickets" PRIMARY KEY ("summons_number")
);
CREATE INDEX "IDX_tickets_vehicle_make" ON "nyc_tickets" ("vehicle_make");
CREATE INDEX "IDX_tickets_issue_date" ON "nyc_tickets" ("issue_date");
COMMIT;
BEGIN;
CREATE TEMP TABLE "temp_tickets"
(
LIKE "nyc_tickets"
) ON COMMIT DROP;
COPY "temp_tickets" FROM '/docker-entrypoint-initdb.d/year_2014.csv' DELIMITER ',' CSV HEADER;
COPY "temp_tickets" FROM '/docker-entrypoint-initdb.d/year_2015.csv' DELIMITER ',' CSV HEADER;
COPY "temp_tickets" FROM '/docker-entrypoint-initdb.d/year_2016.csv' DELIMITER ',' CSV HEADER;
COPY "temp_tickets" ("summons_number", "plate_id", "registration_state", "plate_type", "issue_date", "violation_code",
"vehicle_body_type", "vehicle_make", "issuing_agency", "street_code_1", "street_code_2",
"street_code_3", "vehicle_expiration_date", "violation_location", "violation_precinct",
"issuer_precinct", "issuer_code", "issuer_command", "issuer_squad", "violation_time",
"time_first_observed", "violation_county", "violation_in_front_of_or_opposite", "house_number",
"street_name", "intersecting_street", "date_first_observed", "law_section", "sub_division",
"violation_legal_code", "days_parking_in_effect", "from_hours_in_effect", "to_hours_in_effect",
"vehicle_color", "unregistered_vehicle", "vehicle_year", "meter_number", "feet_from_curb",
"violation_post_code", "violation_description", "no_standing_or_stopping_violation",
"hydrant_violation", "double_parking_violation")
FROM '/docker-entrypoint-initdb.d/year_2017.csv' DELIMITER ',' CSV HEADER;
INSERT INTO "nyc_tickets"
SELECT *
FROM "temp_tickets"
ON CONFLICT DO NOTHING;
COMMIT;
FROM metabase/metabase:v0.39.4
version: "3.0"
services:
postgres:
build: ./postgres
ports:
- "5432:5432"
environment:
- POSTGRES_USER=analyst
- POSTGRES_PASSWORD=tickets
- POSTGRES_DB=tickets
volumes:
- pg-data:/var/lib/postgresql/data
metabase:
# Database env variables are set in environment specific env files in ../containers/metabase
build: ./metabase
ports:
- "3000:3000"
volumes:
pg-data:
cd path/to/project
docker-compose -f containers/compose.yml up
localhost:3000
and after a quick Metabase setup, start building our dashboard.SELECT COUNT(*) AS "vehicle_count", "vehicle_make"
FROM "nyc_tickets"
GROUP BY "vehicle_make"
ORDER BY "vehicle_count" DESC
SELECT COUNT(*) AS "vehicle_count", concat("vehicle_make", '_', "vehicle_year") as make_year
FROM "nyc_tickets"
GROUP BY "vehicle_make", "vehicle_year"
ORDER BY "vehicle_count" DESC
[date_trunc](https://www.postgresql.org/docs/13/functions-datetime.html#FUNCTIONS-DATETIME-TRUNC)
which truncates the input date to the specified date precision, in our case month
SELECT date_trunc('month', issue_date) AS "ticket_month", COUNT(*) AS "tickets_count"
FROM "nyc_tickets"
WHERE issue_date BETWEEN '2013-07-01 00:00:00+00' AND '2017-07-01 00:00:00+00'
GROUP BY "ticket_month"
ORDER BY "ticket_month" ASC
SELECT COUNT(*) AS "vehicle_count", concat(left("violation_time", 2), right("violation_time", 1)) as vtime
FROM "nyc_tickets"
GROUP BY vtime
ORDER BY "vehicle_count" DESC
SELECT to_char("issue_date", 'month') AS "ticket_month", COUNT(*) AS "tickets_count"
FROM "nyc_tickets"
WHERE issue_date BETWEEN '2013-07-01 00:00:00+00' AND '2017-07-01 00:00:00+00'
GROUP BY "ticket_month"
ORDER BY "ticket_month" DESC
SELECT COUNT(*) AS "vehicle_count", "violation_precinct"
FROM "nyc_tickets"
WHERE "violation_precinct" != 0
GROUP BY "violation_precinct"
ORDER BY "vehicle_count" DESC