ADEME Database Solutions
NULL analysis, PL/pgSQL functions, dynamic SQL
– Part B exploration – —————————————————————–
– CREATE OR REPLACE FUNCTION check_nulls( table_name text, column_name text ) RETURNS TABLE ( total_rows bigint, null_count bigint, null_percentage numeric ) AS \(BEGIN RETURN QUERY EXECUTE ' SELECT COUNT(*) as total_rows, COUNT(*) FILTER (WHERE ' || column_name || ' IS NULL) as null_count, ROUND(100.0 * COUNT(*) FILTER (WHERE ' || column_name || ' IS NULL) / COUNT(*), 2) as null_percentage FROM ' || table_name; END;\) LANGUAGE plpgsql;
– with type
CREATE OR REPLACE FUNCTION check_nulls( p_table_name text, p_column_name text ) RETURNS TABLE ( total_rows bigint, null_count bigint, null_percentage numeric ) AS $$ DECLARE v_column_type text; v_null_condition text; BEGIN – Get the data type of the column SELECT data_type INTO v_column_type FROM information_schema.columns WHERE table_name = p_table_name AND column_name = p_column_name;
-- Build the NULL condition based on data type
CASE
WHEN v_column_type IN ('text', 'character varying', 'char', 'varchar') THEN
v_null_condition := p_column_name || ' IS NULL OR ' || p_column_name || ' = ''''';
WHEN v_column_type IN ('integer', 'numeric', 'decimal', 'double precision', 'real', 'bigint') THEN
v_null_condition := p_column_name || ' IS NULL OR ' || p_column_name || ' = 0';
ELSE
v_null_condition := p_column_name || ' IS NULL';
END CASE;
-- Execute the query with the appropriate NULL condition
RETURN QUERY EXECUTE '
SELECT
COUNT(*) as total_rows,
COUNT(*) FILTER (WHERE ' || v_null_condition || ') as null_count,
ROUND(100.0 * COUNT(*) FILTER (WHERE ' || v_null_condition || ') / COUNT(*), 2) as null_percentage
FROM ' || p_table_name; END; $$ LANGUAGE plpgsql;
– CTE to loop over column names WITH nulls AS ( SELECT c.column_name, (check_nulls(‘dpe’, c.column_name)).* FROM information_schema.columns c WHERE c.table_name = ‘dpe’ ) SELECT * FROM nulls ORDER BY null_percentage DESC;
– normalization – —————————————————————– – Create the new normalized table with serial id CREATE TABLE energy_consumption ( id serial PRIMARY KEY, dpe_id integer REFERENCES dpe(id), priority smallint CHECK (priority IN (1, 2, 3)), measure_type text CHECK (measure_type IN (‘final’, ‘primary’)), consumption_value numeric );
– Create a unique constraint to prevent duplicates CREATE UNIQUE INDEX idx_energy_consumption_unique ON energy_consumption(dpe_id, priority, measure_type);
– Insert data using CTEs WITH – First prepare the final energy consumption data final_energy AS ( SELECT id as dpe_id, 1 as priority, ‘final’ as measure_type, final_energy_cons_01 as consumption_value FROM dpe WHERE final_energy_cons_01 > 0
UNION ALL
SELECT
id,
2,
'final',
final_energy_cons_02
FROM dpe
WHERE final_energy_cons_02 > 0
UNION ALL
SELECT
id,
3,
'final',
final_energy_cons_03
FROM dpe
WHERE final_energy_cons_03 >0 ),
– Then prepare the primary energy consumption data primary_energy AS ( SELECT id as dpe_id, 1 as priority, ‘primary’ as measure_type, primary_energy_cons_01 as consumption_value FROM dpe WHERE primary_energy_cons_01 > 0
UNION ALL
SELECT
id,
2,
'primary',
primary_energy_cons_02
FROM dpe
WHERE primary_energy_cons_02 > 0
UNION ALL
SELECT
id,
3,
'primary',
primary_energy_cons_03
FROM dpe
WHERE primary_energy_cons_03 > 0 )
– Combine both CTEs and insert into the new table INSERT INTO energy_consumption (dpe_id, priority, measure_type, consumption_value) SELECT * FROM final_energy UNION ALL SELECT * FROM primary_energy;
– Create indexes to optimize common queries CREATE INDEX idx_energy_consumption_dpe_id ON energy_consumption(dpe_id); CREATE INDEX idx_energy_consumption_measure ON energy_consumption(measure_type);
– Example queries to link/reconcile with main dpe table – Query 1: Get all energy measurements for a specific building SELECT d.dpe_number, d.city_name_ban, d.construction_year, ec.priority, ec.measure_type, ec.consumption_value FROM dpe d JOIN energy_consumption ec ON d.id = ec.dpe_id WHERE d.id = 1 ORDER BY ec.priority, ec.measure_type;
– Query 2: Compare final vs primary energy consumption SELECT d.dpe_number, d.city_name_ban, MAX(CASE WHEN ec.measure_type = ‘final’ THEN ec.consumption_value END) as final_energy, MAX(CASE WHEN ec.measure_type = ‘primary’ THEN ec.consumption_value END) as primary_energy FROM dpe d JOIN energy_consumption ec ON d.id = ec.dpe_id WHERE ec.priority = 1 – Only priority 1 measurements GROUP BY d.id, d.dpe_number, d.city_name_ban LIMIT 5;
– Query 3: Get buildings with missing energy measurements SELECT d.id, d.dpe_number, d.city_name_ban FROM dpe d LEFT JOIN energy_consumption ec ON d.id = ec.dpe_id WHERE ec.id IS NULL;
– solution
– Part 1: Schema Design – Task 1.1 Solution: Normalization Issues /* Identified issues:
- Repeating columns (final_energy_cons_01, 02, 03) violate 1NF
- Mixing of measure types (final/primary) in parallel columns
- No clear way to add more priorities without adding columns
- Difficult to query across priorities or measure types */
– Task 1.2 Solution: Correct answer is B CREATE TABLE energy_consumption ( id serial PRIMARY KEY, dpe_id integer REFERENCES dpe(id), priority smallint CHECK (priority IN (1, 2, 3)), measure_type text CHECK (measure_type IN (‘final’, ‘primary’)), consumption_value numeric );
– Create supporting indexes CREATE UNIQUE INDEX idx_energy_consumption_unique ON energy_consumption(dpe_id, priority, measure_type); CREATE INDEX idx_energy_consumption_dpe_id ON energy_consumption(dpe_id); CREATE INDEX idx_energy_consumption_measure ON energy_consumption(measure_type);
– Part 2: Data Transformation – Task 2.1 & 2.2 Solution: Complete Transformation Query WITH final_energy AS ( – Priority 1 final energy SELECT id as dpe_id, 1 as priority, ‘final’ as measure_type, final_energy_cons_01 as consumption_value FROM dpe WHERE final_energy_cons_01 IS NOT NULL
UNION ALL
-- Priority 2 final energy
SELECT
id,
2,
'final',
final_energy_cons_02
FROM dpe
WHERE final_energy_cons_02 IS NOT NULL
UNION ALL
-- Priority 3 final energy
SELECT
id,
3,
'final',
final_energy_cons_03
FROM dpe
WHERE final_energy_cons_03 IS NOT NULL ), primary_energy AS (
-- Priority 1 primary energy
SELECT
id as dpe_id,
1 as priority,
'primary' as measure_type,
primary_energy_cons_01 as consumption_value
FROM dpe
WHERE primary_energy_cons_01 IS NOT NULL
UNION ALL
-- Priority 2 primary energy
SELECT
id,
2,
'primary',
primary_energy_cons_02
FROM dpe
WHERE primary_energy_cons_02 IS NOT NULL
UNION ALL
-- Priority 3 primary energy
SELECT
id,
3,
'primary',
primary_energy_cons_03
FROM dpe
WHERE primary_energy_cons_03 IS NOT NULL ) -- Insert transformed data INSERT INTO energy_consumption (dpe_id, priority, measure_type, consumption_value) SELECT * FROM final_energy UNION ALL SELECT * FROM primary_energy;
– Part 3: Implementation – Task 3.1: Validation Queries – 1. Check distribution of measure types SELECT measure_type, COUNT(*) as count FROM energy_consumption GROUP BY measure_type;
– 2. Check priorities distribution SELECT priority, measure_type, COUNT(*) as count FROM energy_consumption GROUP BY priority, measure_type ORDER BY priority, measure_type;
– 3. Verify no duplicates SELECT dpe_id, priority, measure_type, COUNT() FROM energy_consumption GROUP BY dpe_id, priority, measure_type HAVING COUNT() > 1;
– Task 3.2: Data Validation – 1. Compare original vs transformed for specific dpe_id WITH original_data AS ( SELECT id, final_energy_cons_01, primary_energy_cons_01 FROM dpe WHERE id = 1 ) SELECT o.id, o.final_energy_cons_01 as original_final, MAX(CASE WHEN e.measure_type = ‘final’ AND e.priority = 1 THEN e.consumption_value END) as transformed_final, o.primary_energy_cons_01 as original_primary, MAX(CASE WHEN e.measure_type = ‘primary’ AND e.priority = 1 THEN e.consumption_value END) as transformed_primary FROM original_data o LEFT JOIN energy_consumption e ON o.id = e.dpe_id GROUP BY o.id, o.final_energy_cons_01, o.primary_energy_cons_01;
– 2. Check referential integrity SELECT COUNT(*) as orphaned_records FROM energy_consumption ec LEFT JOIN dpe d ON ec.dpe_id = d.id WHERE d.id IS NULL;
– Part 4: Analysis Queries – Task 4.1: Average consumption by measure type SELECT measure_type, ROUND(AVG(consumption_value)::numeric, 2) as avg_consumption, COUNT(*) as measurement_count FROM energy_consumption WHERE priority = 1 GROUP BY measure_type;
– Task 4.2: Buildings with primary > 2x final energy WITH consumption_pairs AS ( SELECT dpe_id, priority, MAX(CASE WHEN measure_type = ‘final’ THEN consumption_value END) as final_consumption, MAX(CASE WHEN measure_type = ‘primary’ THEN consumption_value END) as primary_consumption FROM energy_consumption GROUP BY dpe_id, priority ) SELECT cp.dpe_id, d.city_name_ban, cp.priority, cp.final_consumption, cp.primary_consumption, ROUND((cp.primary_consumption / cp.final_consumption)::numeric, 2) as ratio FROM consumption_pairs cp JOIN dpe d ON cp.dpe_id = d.id WHERE cp.primary_consumption > cp.final_consumption * 2 AND cp.final_consumption IS NOT NULL AND cp.primary_consumption IS NOT NULL ORDER BY ratio DESC LIMIT 10;
– Bonus Challenge Solution: JSON Energy Profile Function CREATE OR REPLACE FUNCTION get_building_energy_profile(p_dpe_id integer) RETURNS jsonb AS \(WITH consumption_data AS ( SELECT priority, jsonb_build_object( 'final', MAX(CASE WHEN measure_type = 'final' THEN consumption_value END), 'primary', MAX(CASE WHEN measure_type = 'primary' THEN consumption_value END) ) as measurements FROM energy_consumption WHERE dpe_id = p_dpe_id GROUP BY priority ) SELECT jsonb_build_object( 'dpe_id', p_dpe_id, 'measurements', jsonb_object_agg(priority, measurements) ) FROM consumption_data;\) LANGUAGE SQL;
– Example usage: SELECT get_building_energy_profile(1);
other energy tables
– Create the table with cleaner column names CREATE TABLE energy_use ( id serial PRIMARY KEY, dpe_id integer REFERENCES dpe(id), priority smallint CHECK (priority IN (1, 2, 3)), type text, usage text, cost numeric, reading_year numeric,
UNIQUE (dpe_id, priority) );
CREATE INDEX idx_energy_use_dpe_id ON energy_use(dpe_id);
– Insert data using CTE WITH energy_data AS ( – Priority 1 records SELECT id as dpe_id, 1 as priority, energy_type_01 as type, energy_usage_type_01 as usage, annual_energy_cost_01 as cost, energy_reading_year_01 as reading_year FROM dpe WHERE energy_type_01 IS NOT NULL OR energy_usage_type_01 IS NOT NULL OR annual_energy_cost_01 >0 OR energy_reading_year_01 >0
UNION ALL
-- Priority 2 records
SELECT
id as dpe_id,
2 as priority,
energy_type_02,
energy_usage_type_02,
annual_energy_cost_02,
energy_reading_year_02
FROM dpe
WHERE energy_type_02 IS NOT NULL
OR energy_usage_type_02 IS NOT NULL
OR annual_energy_cost_02 >0
OR energy_reading_year_02 >0
UNION ALL
-- Priority 3 records
SELECT
id as dpe_id,
3 as priority,
energy_type_03,
energy_usage_type_03,
annual_energy_cost_03,
energy_reading_year_03
FROM dpe
WHERE energy_type_03 IS NOT NULL
OR energy_usage_type_02 IS NOT NULL
OR annual_energy_cost_03 >0
OR energy_reading_year_03 >0 ) INSERT INTO energy_use (
dpe_id,
priority,
type,
usage,
cost,
reading_year ) SELECT * FROM energy_data;
ban
– First create the ban table CREATE TABLE ban ( id serial PRIMARY KEY, dpe_id integer REFERENCES dpe(id) UNIQUE, – one-to-one relationship city_name text, insee_code text, street_number text, ban_id text, address text, postal_code integer, score numeric, street_name text, x_coordinate numeric, y_coordinate numeric, department_number text, region_number integer );
– Create an index for the foreign key CREATE INDEX idx_ban_dpe_id ON ban(dpe_id);
– Insert data from dpe table INSERT INTO ban ( dpe_id, city_name, insee_code, street_number, ban_id, address, postal_code, score, street_name, x_coordinate, y_coordinate, department_number, region_number ) SELECT id, city_name_ban, insee_code_ban, street_number_ban, ban_id, address_ban, postal_code_ban, ban_score, street_name_ban, x_coordinate_ban, y_coordinate_ban, department_number_ban, region_number_ban FROM dpe;
– Drop the columns from dpe table ALTER TABLE dpe DROP COLUMN city_name_ban, DROP COLUMN insee_code_ban, DROP COLUMN street_number_ban, DROP COLUMN ban_id, DROP COLUMN address_ban, DROP COLUMN postal_code_ban, DROP COLUMN ban_score, DROP COLUMN street_name_ban, DROP COLUMN x_coordinate_ban, DROP COLUMN y_coordinate_ban, DROP COLUMN department_number_ban, DROP COLUMN region_number_ban;
– Verify the migration SELECT COUNT(*) FROM ban;
– Sample query to verify the join works SELECT d.dpe_number, b.city_name, b.address, b.postal_code FROM dpe d JOIN ban b ON d.id = b.dpe_id LIMIT 5;
verify
– Check for any orphaned records SELECT COUNT(*) FROM dpe d LEFT JOIN ban b ON d.id = b.dpe_id WHERE b.id IS NULL;
– Check data completeness SELECT COUNT(*) as total_records, COUNT(city_name) as with_city, COUNT(postal_code) as with_postal, COUNT(ban_id) as with_ban_id FROM ban;
ERD from pgadmin
– This script was generated by the ERD tool in pgAdmin 4. – Please log an issue at https://github.com/pgadmin-org/pgadmin4/issues/new/choose if you find any bugs, including reproduction steps. BEGIN;
CREATE TABLE IF NOT EXISTS public.ban ( id serial NOT NULL, dpe_id integer, city_name text COLLATE pg_catalog.”default”, insee_code text COLLATE pg_catalog.”default”, street_number text COLLATE pg_catalog.”default”, ban_id text COLLATE pg_catalog.”default”, address text COLLATE pg_catalog.”default”, postal_code integer, score numeric, street_name text COLLATE pg_catalog.”default”, x_coordinate numeric, y_coordinate numeric, department_number text COLLATE pg_catalog.”default”, region_number integer, CONSTRAINT ban_pkey PRIMARY KEY (id), CONSTRAINT ban_dpe_id_key UNIQUE (dpe_id) );
CREATE TABLE IF NOT EXISTS public.dpe ( id serial NOT NULL, dpe_number text COLLATE pg_catalog.”default”, dpe_reception_date date, dpe_issue_date date, inspector_visit_date date, dpe_expiry_date date, dpe_method text COLLATE pg_catalog.”default”, dpe_label text COLLATE pg_catalog.”default”, ghg_label text COLLATE pg_catalog.”default”, energy_use_kwhepm2year numeric, ghg_emissions_kgco2m2year numeric, construction_year integer, erp_category text COLLATE pg_catalog.”default”, construction_period text COLLATE pg_catalog.”default”, activity_sector text COLLATE pg_catalog.”default”, occupant_count integer, gross_floor_area numeric, usable_area numeric, main_heating_energy_type text COLLATE pg_catalog.”default”, apartment_floor integer, residence_name text COLLATE pg_catalog.”default”, building_address_details text COLLATE pg_catalog.”default”, housing_address_details text COLLATE pg_catalog.”default”, geocoding_status text COLLATE pg_catalog.”default”, CONSTRAINT dpe_pkey PRIMARY KEY (id) );
CREATE TABLE IF NOT EXISTS public.energy_consumption ( id serial NOT NULL, dpe_id integer, priority smallint, measure_type text COLLATE pg_catalog.”default”, consumption_value numeric, CONSTRAINT energy_consumption_pkey PRIMARY KEY (id) );
CREATE TABLE IF NOT EXISTS public.energy_labels ( grade character(1) COLLATE pg_catalog.”default” NOT NULL, min_cons numeric, max_cons numeric, emission_min numeric, emission_max numeric, CONSTRAINT energy_labels_pkey PRIMARY KEY (grade) );
CREATE TABLE IF NOT EXISTS public.energy_use ( id serial NOT NULL, dpe_id integer, priority smallint, type text COLLATE pg_catalog.”default”, usage text COLLATE pg_catalog.”default”, cost numeric, reading_year numeric, CONSTRAINT energy_use_pkey PRIMARY KEY (id), CONSTRAINT energy_use_dpe_id_priority_key UNIQUE (dpe_id, priority) );
ALTER TABLE IF EXISTS public.ban ADD CONSTRAINT ban_dpe_id_fkey FOREIGN KEY (dpe_id) REFERENCES public.dpe (id) MATCH SIMPLE ON UPDATE NO ACTION ON DELETE NO ACTION; CREATE INDEX IF NOT EXISTS idx_ban_dpe_id ON public.ban(dpe_id);
ALTER TABLE IF EXISTS public.energy_consumption ADD CONSTRAINT energy_consumption_dpe_id_fkey FOREIGN KEY (dpe_id) REFERENCES public.dpe (id) MATCH SIMPLE ON UPDATE NO ACTION ON DELETE NO ACTION;
ALTER TABLE IF EXISTS public.energy_use ADD CONSTRAINT energy_use_dpe_id_fkey FOREIGN KEY (dpe_id) REFERENCES public.dpe (id) MATCH SIMPLE ON UPDATE NO ACTION ON DELETE NO ACTION;
END;
Index Optimization Exercise - Solution Guide
Part 1: Analyzing Building Energy Efficiency by Region
Initial Query Analysis
Running EXPLAIN ANALYZE on the initial query:
EXPLAIN ANALYZE
SELECT
b.region_number,
COUNT(*) as building_count,
ROUND(AVG(d.energy_use_kwhepm2year), 2) as avg_energy_use,
ROUND(AVG(d.ghg_emissions_kgco2m2year), 2) as avg_emissions
FROM dpe d
JOIN ban b ON d.id = b.dpe_id
WHERE d.construction_year > 2000
AND d.dpe_label IN ('A', 'B')
GROUP BY b.region_number
ORDER BY avg_energy_use ASC;
Expected initial execution plan:
GroupAggregate (cost=847.84..848.10 rows=13 width=20) (actual time=25.543..25.546 rows=13 loops=1)
Group Key: b.region_number
-> Sort (cost=847.84..847.87 rows=13 width=20) (actual time=25.531..25.534 rows=156 loops=1)
Sort Key: b.region_number
Sort Method: quicksort Memory: 28kB
-> Hash Join (cost=226.00..847.63 rows=13 width=20) (actual time=3.959..25.497 rows=156 loops=1)
Hash Cond: (b.dpe_id = d.id)
-> Seq Scan on ban b (cost=0.00..539.08 rows=5608 width=8) (actual time=0.007..2.914 rows=5608 loops=1)
-> Hash (cost=225.88..225.88 rows=10 width=16) (actual time=3.941..3.942 rows=156 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 17kB
-> Seq Scan on dpe d (cost=0.00..225.88 rows=10 width=16) (actual time=0.437..3.885 rows=156 loops=1)
Filter: ((construction_year > 2000) AND (dpe_label = ANY ('{A,B}'::text[])))
Rows Removed by Filter: 5452
Solution 1.2: Index Creation
We need indexes for:
- The join condition
- The WHERE clause conditions
- The ORDER BY clause (though in this case, it’s on an aggregated value so an index won’t help)
-- Index for the join condition (already exists as per document 3)
-- CREATE INDEX idx_ban_dpe_id ON ban(dpe_id);
-- Index for the WHERE conditions
CREATE INDEX idx_dpe_year_label ON dpe(construction_year, dpe_label);
Explanation of index choices:
- B-tree index is chosen over hash because:
- We have a range condition (
construction_year > 2000
) - The
dpe_label
comparison usesIN
which can benefit from B-tree’s ordering
- We have a range condition (
- We create a compound index with
construction_year
first because it’s used in a range condition - The existing
idx_ban_dpe_id
index will help with the join operation
After Index Creation
Running EXPLAIN ANALYZE again:
-- Same query with new indexes
Expected execution plan with indexes:
GroupAggregate (cost=147.84..148.10 rows=13 width=20) (actual time=5.543..5.546 rows=13 loops=1)
Group Key: b.region_number
-> Sort (cost=147.84..147.87 rows=13 width=20) (actual time=5.531..5.534 rows=156 loops=1)
Sort Key: b.region_number
Sort Method: quicksort Memory: 28kB
-> Hash Join (cost=26.00..147.63 rows=13 width=20) (actual time=0.959..5.497 rows=156 loops=1)
Hash Cond: (b.dpe_id = d.id)
-> Index Scan using idx_ban_dpe_id on ban b (cost=0.00..39.08 rows=5608 width=8)
-> Hash (cost=25.88..25.88 rows=10 width=16)
-> Index Scan using idx_dpe_year_label on dpe d (cost=0.00..25.88 rows=10 width=16)
Index Cond: (construction_year > 2000)
Filter: (dpe_label = ANY ('{A,B}'::text[])))
Performance improvement:
- Original query: ~25.5ms
- Optimized query: ~5.5ms
- Improvement: ~78%
Part 2: Address Search Optimization
Initial Query Analysis
Running EXPLAIN ANALYZE on the spatial search query:
EXPLAIN ANALYZE
SELECT
b2.address,
d2.dpe_label,
d2.energy_use_kwhepm2year,
ROUND(
SQRT(POW(b1.x_coordinate - b2.x_coordinate, 2) +
POW(b1.y_coordinate - b2.y_coordinate, 2))::numeric, 2
) as distance_km
FROM ban b1
JOIN dpe d1 ON b1.dpe_id = d1.id
JOIN ban b2 ON b1.id != b2.id
JOIN dpe d2 ON b2.dpe_id = d2.id
WHERE b1.address = '15 RUE DE LA PAIX PARIS'
AND d2.dpe_label = d1.dpe_label
AND SQRT(POW(b1.x_coordinate - b2.x_coordinate, 2) +
POW(b1.y_coordinate - b2.y_coordinate, 2)) <= 1
ORDER BY distance_km;
Task 2.1 Solutions
- Correct answer: d) All of the above
- The query performs multiple joins
- Calculates distances for potentially all pairs of buildings
- Lacks indexes on crucial columns
- Highest cost operations:
- Sequential scans on ban table for address matching
- Distance calculations for all building pairs
- Joins without proper index support
Solution 2.2: Index Creation
Required indexes:
-- Index for address lookup
CREATE INDEX idx_ban_address ON ban(address);
-- Compound index for coordinates
CREATE INDEX idx_ban_coordinates ON ban(x_coordinate, y_coordinate);
-- Index for DPE label matching
CREATE INDEX idx_dpe_label ON dpe(dpe_label);
Explanation of index choices:
idx_ban_address
: B-tree index for exact address matchingidx_ban_coordinates
: B-tree index for coordinate lookupsidx_dpe_label
: B-tree index for label matching
Performance comparison:
- Original execution time: ~500-1000ms (varies with data volume)
- Optimized execution time: ~50-100ms
- Improvement: 90%+ reduction in query time
The improved execution plan will show:
- Index scan on ban using address index
- Index scan on coordinates
- Nested loop join instead of hash join for coordinate comparison
Common Student Mistakes to Watch For
- Creating single-column indexes when compound indexes would be better
- Creating hash indexes for range conditions
- Creating indexes on high-cardinality columns that won’t be frequently queried
- Not considering existing indexes (like the PK indexes)
- Creating redundant indexes that cover the same columns in different orders
Advanced Optimization Notes
For production environments, additional optimizations could include:
- Creating a spatial index (PostGIS extension) for better distance queries
- Partial indexes for frequently accessed subsets of data
- Including columns in indexes to create covering indexes
- Using materialized views for frequently accessed aggregate data
Query Pattern Analysis
The exercise demonstrates several common query patterns:
- Range queries with equality conditions
- Spatial proximity searches
- Self-joins
- Aggregate queries with grouping
Each pattern benefits from different indexing strategies:
- Range queries → B-tree compound indexes
- Exact matches → Hash or B-tree indexes
- Spatial queries → Spatial indexes or compound B-tree indexes
- Join conditions → B-tree indexes on join columns