-- note that the below was cobbled together from my command history --I haven't yet re-run it, so, use at your own risk -- start with the following table containing some of the data scraped from ECFS -- (I first scraped to mongo and then used https://github.com/stripe/mosql to load into postgresql) /* Table "public.proc_17_108" Column | Type | Modifiers ----------------------------+-----------------------+----------- id | character varying(40) | not null date_submission | character varying(40) | contact_email | text | confirmation_number | character varying(40) | text_data | text | index | character varying(40) | filers | text | addressentity | text | internationaladdressentity | text | Indexes: "proc_17_108_pkey" PRIMARY KEY, btree (id) */ -- preserve original SELECT * INTO proc_17_108_copy FROM proc_17_108; -- add text hash and pick out unique entries ALTER TABLE proc_17_108_copy ADD COLUMN text_data_hash uuid; UPDATE proc_17_108_copy SET text_data_hash = md5(text_data)::uuid; SELECT DISTINCT ON (text_data_hash) text_data_hash, text_data, docid INTO proc_17_108_copy_uniques FROM proc_17_108_copy; --add dupe_count and count the number of dupes ALTER TABLE proc_17_108_copy_uniques ADD COLUMN dupe_count INT DEFAULT NULL; UPDATE proc_17_108_copy_uniques SET dupe_count = subquery.dupecount FROM ( SELECT text_data_hash as thash, COUNT(text_data_hash) as dupecount FROM proc_17_108_copy GROUP BY text_data_hash ) AS subquery WHERE proc_17_108_copy_uniques.text_data_hash = subquery.thash;