-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
first code commit: sql to remove and count duplicates; EDA notebook
- Loading branch information
Showing
3 changed files
with
1,989 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
*.txt | ||
*.csv | ||
*.pkl | ||
__pycache__/ | ||
.ipynb_checkpoints/ | ||
.idea/ | ||
cleanup/ | ||
data/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
-- note that the below was cobbled together from my command history | ||
--I haven't yet re-run it, so, use at your own risk | ||
|
||
-- start with the following table containing some of the data scraped from ECFS | ||
-- (I first scraped to mongo and then used https://github.com/stripe/mosql to load into postgresql) | ||
|
||
/* Table "public.proc_17_108" | ||
Column | Type | Modifiers | ||
----------------------------+-----------------------+----------- | ||
id | character varying(40) | not null | ||
date_submission | character varying(40) | | ||
contact_email | text | | ||
confirmation_number | character varying(40) | | ||
text_data | text | | ||
index | character varying(40) | | ||
filers | text | | ||
addressentity | text | | ||
internationaladdressentity | text | | ||
Indexes: | ||
"proc_17_108_pkey" PRIMARY KEY, btree (id) | ||
*/ | ||
|
||
-- preserve original | ||
SELECT * INTO proc_17_108_copy FROM proc_17_108; | ||
|
||
-- add text hash and pick out unique entries | ||
ALTER TABLE proc_17_108_copy ADD COLUMN text_data_hash uuid; | ||
UPDATE proc_17_108_copy SET text_data_hash = md5(text_data)::uuid; | ||
SELECT DISTINCT ON (text_data_hash) text_data_hash, text_data, docid INTO proc_17_108_copy_uniques FROM proc_17_108_copy; | ||
|
||
--add dupe_count and count the number of dupes | ||
ALTER TABLE proc_17_108_copy_uniques ADD COLUMN dupe_count INT DEFAULT NULL; | ||
UPDATE proc_17_108_copy_uniques | ||
SET dupe_count = subquery.dupecount | ||
FROM ( | ||
SELECT text_data_hash as thash, COUNT(text_data_hash) as dupecount | ||
FROM proc_17_108_copy GROUP BY text_data_hash | ||
) AS subquery | ||
WHERE proc_17_108_copy_uniques.text_data_hash = subquery.thash; |
Oops, something went wrong.