-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Cleaning Project.sql
104 lines (63 loc) · 3.62 KB
/
Data Cleaning Project.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
SELECT * FROM layoffs;
CREATE TABLE layoffs_staging LIKE layoffs;
SELECT * FROM layoffs_staging;
INSERT layoffs_staging SELECT * FROM layoffs;
-- Removing duplicates
SELECT *, ROW_NUMBER() OVER(PARTITION BY company, industry, total_laid_off, percentage_laid_off, `date`) AS row_num FROM layoffs_staging;
WITH duplicate_cte AS
(SELECT *, ROW_NUMBER() OVER(PARTITION BY company, location, industry, total_laid_off, percentage_laid_off,
`date`, stage, country, funds_raised_millions) AS row_num FROM layoffs_staging)
SELECT * FROM duplicate_cte WHERE row_num > 1;
SELECT * FROM layoffs_staging WHERE company = 'Casper';
WITH duplicate_cte AS
(SELECT *, ROW_NUMBER() OVER(PARTITION BY company, location, industry, total_laid_off, percentage_laid_off,
`date`, stage, country, funds_raised_millions) AS row_num FROM layoffs_staging)
DELETE FROM duplicate_cte WHERE row_num > 1;
CREATE TABLE `layoffs_staging2` (
`company` text,
`location` text,
`industry` text,
`total_laid_off` int DEFAULT NULL,
`percentage_laid_off` text,
`date` text,
`stage` text,
`country` text,
`funds_raised_millions` int DEFAULT NULL,
`row_num` INT
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
SELECT * FROM layoffs_staging2;
INSERT INTO layoffs_staging2 SELECT *, ROW_NUMBER() OVER(PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country,
funds_raised_millions) AS row_num FROM layoffs_staging;
SELECT * FROM layoffs_staging2 WHERE row_num > 1;
SET SQL_SAFE_UPDATES=0;
DELETE FROM layoffs_staging2 WHERE layoffs_staging2.row_num > 1;
SET SQL_SAFE_UPDATES=1;
-- Standardizing data
SELECT company, TRIM(company) FROM layoffs_staging2;
UPDATE layoffs_staging2 SET company = TRIM(company);
SELECT DISTINCT industry FROM layoffs_staging2 ORDER BY 1;
SELECT * FROM layoffs_staging2 WHERE industry LIKE 'Crypto%';
UPDATE layoffs_staging2 SET industry = 'Crypto' WHERE industry LIKE 'Crypto%';
SELECT DISTINCT location FROM layoffs_staging2 ORDER BY 1;
SELECT DISTINCT country FROM layoffs_staging2 ORDER BY 1;
SELECT * FROM layoffs_staging2 WHERE country LIKE 'United States%';
SELECT DISTINCT country, TRIM(TRAILING '.' FROM country) FROM layoffs_staging2 ORDER BY 1;
UPDATE layoffs_staging2 SET country = TRIM(TRAILING '.' FROM country) WHERE country LIKE 'United States%';
SELECT `date`, STR_TO_DATE(`date`, '%m/%d/%Y') FROM layoffs_staging2;
UPDATE layoffs_staging2 SET `date` = STR_TO_DATE(`date`, '%m/%d/%Y');
ALTER TABLE layoffs_staging2 MODIFY COLUMN `date` DATE;
SELECT * FROM layoffs_staging2;
-- Null or Blank values
SELECT * FROM layoffs_staging2 WHERE industry IS NULL OR industry = '';
SELECT * FROM layoffs_staging2 WHERE company = 'Airbnb';
SELECT * FROM layoffs_staging2 t1 JOIN layoffs_staging2 t2 ON t1.company = t2.company WHERE (t1.industry IS NULL OR t1.industry = '') AND t2.industry IS NOT NULL;
SELECT t1.industry, t2.industry FROM layoffs_staging2 t1 JOIN layoffs_staging2 t2 ON t1.company = t2.company
WHERE (t1.industry IS NULL OR t1.industry = '') AND t2.industry IS NOT NULL;
UPDATE layoffs_staging2 SET industry = NULL WHERE industry = '';
UPDATE layoffs_staging2 t1 JOIN layoffs_staging2 t2 ON t1.company = t2.company SET t1.industry = t2.industry WHERE t1.industry IS NULL AND t2.industry IS NOT NULL;
SELECT * FROM layoffs_staging2 WHERE company LIKE 'Bally%';
-- Removing unnecessary rows and columns
SELECT * FROM layoffs_staging2 WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;
DELETE FROM layoffs_staging2 WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;
ALTER TABLE layoffs_staging2 DROP COLUMN row_num;
SELECT * FROM layoffs_staging2;