-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlayoff.sql
More file actions
135 lines (102 loc) · 3.7 KB
/
Copy pathlayoff.sql
File metadata and controls
135 lines (102 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
use layoff_database;
SHOW TABLES;
DESCRIBE layoffs;
SELECT * FROM layoffs;
SELECT COUNT(*) FROM layoffs;
-- ******************REMOVING DU8PLICATES******************************
CREATE TABLE layoffs_distinct AS
SELECT DISTINCT * FROM layoffs;
-- checking for duplicates
SELECT COUNT(*), "distinct data" AS "desc" FROM layoffs_distinct
UNION ALL
SELECT COUNT(*), "duplicate data" AS "desc" FROM layoffs;
-- executing deduplication
DROP TABLE layoffs ;
ALTER TABLE layoffs_distinct RENAME TO layoffs;
-- ******************* STANDARDIZING DATA *****************************
-- scanning data integrity issues in "company" column
SELECT DISTINCT company
FROM layoffs
ORDER BY company;
-- trimming white spaces
SELECT DISTINCT company, TRIM(company)
FROM layoffs
ORDER BY company;
UPDATE layoffs
SET company=TRIM(company);
-- fixing value inconsistencies in "location" column
SELECT DISTINCT location FROM layoffs
ORDER BY location;
UPDATE layoffs
SET location = 'Düsseldorf'
WHERE location IN ('Dusseldorf', 'Düsseldorf');
UPDATE layoffs
SET location = 'Malmö'
WHERE location = 'Malmö';
UPDATE layoffs
SET location = 'Florianópolis'
WHERE location = 'Florianópolis';
-- fixing value inconsistencies in "industry" column
SELECT DISTINCT industry FROM layoffs ORDER BY industry;
SELECT * FROM layoffs
WHERE industry LIKE "Crypto%";
UPDATE layoffs
SET industry="Crypto"
WHERE industry LIKE "Crypto%";
SELECT DISTINCT country FROM layoffs
ORDER BY country;
-- fixing datatype of "date" column from TEXT to DATE
SELECT DISTINCT date FROM layoffs
ORDER BY date;
SELECT STR_TO_DATE(date, '%c/%e/%Y') FROM layoffs;
UPDATE layoffs
SET date= STR_TO_DATE(date, '%c/%e/%Y');
ALTER TABLE layoffs
MODIFY COLUMN date DATE;
SELECT DISTINCT date FROM layoffs ORDER BY date;
-- fixing value inconsistencies in "country" column
SELECT DISTINCT country FROM layoffs
ORDER BY country;
SELECT * FROM layoffs
WHERE country like "United States%";
UPDATE layoffs
SET country="United States"
WHERE country LIKE "United States%" ;
-- **************************HANDLING MISSING VAUES*************************
CREATE VIEW layoffs_null_values_count AS
SELECT "company" AS col_name, COUNT(*)-COUNT(company) AS null_count FROM layoffs
UNION ALL
SELECT "location" AS col_name, COUNT(*)-COUNT(location) AS null_count FROM layoffs
UNION ALL
SELECT "industry" AS col_name, COUNT(*)-COUNT(industry) AS null_count FROM layoffs
UNION ALL
SELECT "total_laid_off" AS col_name, COUNT(*)-COUNT(total_laid_off) AS null_count FROM layoffs
UNION ALL
SELECT "percentage_laid_off" AS col_name, COUNT(*)-COUNT(percentage_laid_off) AS null_count FROM layoffs
UNION ALL
SELECT "date" AS col_name, COUNT(*)-COUNT(date) AS null_count FROM layoffs
UNION ALL
SELECT "stage" AS col_name, COUNT(*)-COUNT(stage) AS null_count FROM layoffs
UNION ALL
SELECT "country" AS col_name, COUNT(*)-COUNT(country) AS null_count FROM layoffs
UNION ALL
SELECT "funds_raised_millions" AS col_name, COUNT(*)-COUNT(funds_raised_millions) AS null_count FROM layoffs;
SELECT* FROM layoffs_null_values_count;
ALTER TABLE layoffs
MODIFY COLUMN percentage_laid_off FLOAT;
-- Impute null values in stage column to "Unknown"
UPDATE layoffs
SET stage="Unknown" WHERE stage IS NULL;
-- impute null values and empty strings in industry column to the industry value in other similar columns
UPDATE layoffs SET industry= NULL
WHERE industry="";
UPDATE layoffs t1
JOIN layoffs t2
ON t1.company=t2.company
AND t1.location=t2.location
SET t1.industry=t2.industry
WHERE t1.industry IS NULL AND t2.industry IS NOT NULL;
-- filtering out rows with null values
DELETE FROM layoffs WHERE industry IS NULL;
DELETE FROM layoffs WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;
DELETE FROM layoffs WHERE date IS NULL;