SQL Fundamentals: Queries, Joins, and Optimization
SQL Fundamentals: Queries, Joins, and Optimization
SQL (Structured Query Language) is essential for working with relational databases. Understanding SQL deeply leads to better application performance.
Basic CRUD Operations
SELECT - Reading Data
-- Select all columns
SELECT * FROM users;
-- Select specific columns
SELECT id, name, email FROM users;
-- Aliases
SELECT id AS user_id, name AS full_name FROM users;
-- DISTINCT - Remove duplicates
SELECT DISTINCT country FROM users;
-- LIMIT - Pagination
SELECT * FROM users LIMIT 10;
SELECT * FROM users LIMIT 10 OFFSET 20; -- Page 3 with 10 per page
-- Ordering
SELECT * FROM users ORDER BY created_at DESC;
SELECT * FROM users ORDER BY name ASC, age DESC;
WHERE - Filtering
-- Simple conditions
SELECT * FROM users WHERE age > 18;
SELECT * FROM users WHERE status = 'active';
-- Multiple conditions
SELECT * FROM users WHERE age > 18 AND country = 'USA';
SELECT * FROM users WHERE status = 'active' OR status = 'pending';
SELECT * FROM users WHERE NOT status = 'deleted';
-- IN operator
SELECT * FROM users WHERE id IN (1, 2, 3, 4, 5);
SELECT * FROM users WHERE country IN ('USA', 'Canada', 'Mexico');
-- BETWEEN
SELECT * FROM users WHERE age BETWEEN 18 AND 65;
SELECT * FROM users WHERE created_at BETWEEN '2024-01-01' AND '2024-12-31';
-- LIKE - Pattern matching
SELECT * FROM users WHERE name LIKE 'John%'; -- Starts with John
SELECT * FROM users WHERE name LIKE '%Smith'; -- Ends with Smith
SELECT * FROM users WHERE name LIKE '%oh%'; -- Contains oh
-- IS NULL
SELECT * FROM users WHERE phone IS NULL;
SELECT * FROM users WHERE phone IS NOT NULL;
INSERT - Creating Data
-- Insert single row
INSERT INTO users (name, email, age) VALUES ('John', 'john@example.com', 30);
-- Insert multiple rows
INSERT INTO users (name, email, age) VALUES
('Alice', 'alice@example.com', 25),
('Bob', 'bob@example.com', 35),
('Charlie', 'charlie@example.com', 28);
-- Insert with default values
INSERT INTO users (name, email) VALUES ('Jane', 'jane@example.com');
-- age gets default value
-- Insert from SELECT
INSERT INTO users_archive (name, email, age)
SELECT name, email, age FROM users WHERE created_at < '2020-01-01';
UPDATE - Modifying Data
-- Update single row
UPDATE users SET email = 'newemail@example.com' WHERE id = 1;
-- Update multiple columns
UPDATE users SET status = 'inactive', updated_at = NOW() WHERE last_login < '2023-01-01';
-- Update with expressions
UPDATE products SET price = price * 1.1; -- 10% price increase
UPDATE users SET age = age + 1 WHERE birthday_today = true;
-- Conditional update
UPDATE orders SET status = 'shipped' WHERE status = 'processing' AND created_at < NOW() - INTERVAL '1 day';
DELETE - Removing Data
-- Delete single row
DELETE FROM users WHERE id = 1;
-- Delete multiple rows
DELETE FROM users WHERE status = 'deleted';
-- Delete all (be careful!)
DELETE FROM users;
-- Safe deletion pattern
DELETE FROM users WHERE id IN (SELECT id FROM users WHERE status = 'inactive' LIMIT 100);
Joins - Combining Tables
INNER JOIN - Common Records
-- Get users and their posts
SELECT users.name, posts.title, posts.created_at
FROM users
INNER JOIN posts ON users.id = posts.user_id;
-- Aliases
SELECT u.name, p.title, p.created_at
FROM users u
INNER JOIN posts p ON u.id = p.user_id;
-- Multiple joins
SELECT u.name, p.title, c.body
FROM users u
INNER JOIN posts p ON u.id = p.user_id
INNER JOIN comments c ON p.id = c.post_id;
-- Join with WHERE
SELECT u.name, p.title
FROM users u
INNER JOIN posts p ON u.id = p.user_id
WHERE p.published = true;
LEFT JOIN - All Left Records
-- All users, even without posts
SELECT u.name, p.title
FROM users u
LEFT JOIN posts p ON u.id = p.user_id;
-- Count posts per user (including users with no posts)
SELECT u.name, COUNT(p.id) as post_count
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
GROUP BY u.id, u.name;
-- Find users with no posts
SELECT u.name
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
WHERE p.id IS NULL;
RIGHT JOIN, FULL OUTER JOIN
-- RIGHT JOIN - All right table records
SELECT u.name, p.title
FROM users u
RIGHT JOIN posts p ON u.id = p.user_id;
-- FULL OUTER JOIN - All records from both tables
SELECT u.name, p.title
FROM users u
FULL OUTER JOIN posts p ON u.id = p.user_id;
-- CROSS JOIN - Cartesian product (use carefully!)
SELECT u.name, c.name
FROM users u
CROSS JOIN categories c;
Aggregation and Grouping
Aggregate Functions
-- Count records
SELECT COUNT(*) FROM users;
SELECT COUNT(*) FROM users WHERE status = 'active';
-- Sum, Average, Min, Max
SELECT SUM(salary) FROM employees;
SELECT AVG(age) FROM users;
SELECT MIN(created_at) FROM posts;
SELECT MAX(price) FROM products;
-- Multiple aggregations
SELECT
COUNT(*) as total_users,
AVG(age) as average_age,
MIN(age) as youngest,
MAX(age) as oldest
FROM users;
GROUP BY and HAVING
-- Count posts per user
SELECT user_id, COUNT(*) as post_count
FROM posts
GROUP BY user_id;
-- With column names
SELECT u.name, COUNT(p.id) as post_count
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
GROUP BY u.id, u.name;
-- HAVING - Filter aggregations
SELECT user_id, COUNT(*) as post_count
FROM posts
GROUP BY user_id
HAVING COUNT(*) > 5;
-- Complex aggregation
SELECT
country,
COUNT(*) as user_count,
AVG(age) as average_age
FROM users
WHERE status = 'active'
GROUP BY country
HAVING COUNT(*) > 10
ORDER BY user_count DESC;
Subqueries and Common Table Expressions
Subqueries
-- Subquery in WHERE
SELECT * FROM users WHERE id IN (
SELECT user_id FROM posts WHERE published = true
);
-- Subquery with alias
SELECT u.name, user_posts.post_count
FROM users u
JOIN (
SELECT user_id, COUNT(*) as post_count
FROM posts
GROUP BY user_id
) user_posts ON u.id = user_posts.user_id;
-- Correlated subquery
SELECT u.name, (
SELECT COUNT(*) FROM posts WHERE user_id = u.id
) as post_count
FROM users u;
Common Table Expressions (CTE)
-- WITH clause for cleaner queries
WITH active_users AS (
SELECT id, name FROM users WHERE status = 'active'
)
SELECT au.name, COUNT(p.id) as post_count
FROM active_users au
LEFT JOIN posts p ON au.id = p.user_id
GROUP BY au.id, au.name;
-- Multiple CTEs
WITH user_posts AS (
SELECT user_id, COUNT(*) as post_count
FROM posts
GROUP BY user_id
),
active_users AS (
SELECT id, name FROM users WHERE status = 'active'
)
SELECT au.name, COALESCE(up.post_count, 0) as posts
FROM active_users au
LEFT JOIN user_posts up ON au.id = up.user_id;
Query Optimization
Indexing
-- Create index on frequently searched columns
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_posts_user_id ON posts(user_id);
-- Composite index for common queries
CREATE INDEX idx_posts_user_published ON posts(user_id, published);
-- View existing indexes
SELECT * FROM pg_indexes WHERE tablename = 'users';
-- Drop unused indexes
DROP INDEX idx_users_email;
EXPLAIN ANALYZE - Understanding Query Performance
-- See query execution plan
EXPLAIN ANALYZE
SELECT u.name, COUNT(p.id) as post_count
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
GROUP BY u.id, u.name;
-- Look for:
-- - Sequential Scans (should use indexes if possible)
-- - High cost values
-- - N+1 query patterns
Common Performance Mistakes
-- BAD - N+1 Query Pattern
-- Get users
SELECT * FROM users;
-- Then for each user, get posts
SELECT * FROM posts WHERE user_id = ?; -- Repeated N times
-- GOOD - Single join
SELECT u.id, u.name, COUNT(p.id) as post_count
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
GROUP BY u.id, u.name;
-- BAD - LIKE with leading wildcard (won't use index)
SELECT * FROM users WHERE name LIKE '%john%';
-- GOOD - LIKE with leading character (uses index)
SELECT * FROM users WHERE name LIKE 'john%';
-- BAD - Type mismatch (forces conversion)
SELECT * FROM users WHERE id = '123'; -- id is integer
-- GOOD - Correct type
SELECT * FROM users WHERE id = 123;
-- BAD - Unnecessary functions (disables index)
SELECT * FROM users WHERE YEAR(created_at) = 2024;
-- GOOD - Range query (uses index)
SELECT * FROM users WHERE created_at >= '2024-01-01' AND created_at < '2025-01-01';
Window Functions
-- Row number
SELECT
id,
name,
salary,
ROW_NUMBER() OVER (ORDER BY salary DESC) as rank
FROM employees;
-- Partition (GROUP BY alternative)
SELECT
id,
name,
department,
salary,
AVG(salary) OVER (PARTITION BY department) as dept_avg_salary
FROM employees;
-- Lag/Lead
SELECT
id,
amount,
LAG(amount) OVER (ORDER BY created_at) as previous_amount,
LEAD(amount) OVER (ORDER BY created_at) as next_amount
FROM transactions;
-- Running total
SELECT
id,
amount,
SUM(amount) OVER (ORDER BY created_at) as running_total
FROM transactions;
Conclusion
SQL optimization principles:
- Use EXPLAIN ANALYZE to understand queries
- Create indices on frequently searched columns
- Join tables instead of running multiple queries
- Use GROUP BY and HAVING for aggregations
- Avoid leading wildcards in LIKE patterns
- Always use WHERE clauses to limit result sets
- Test queries with realistic data volumes