/**
* Hive Exercises
*
* part of a longer hadoop video course at http://bit.ly/learn-hadoop/
*
* @author rICh <rich@quicloud.com>
*/
// our sample "page_view" data looks like this:
1391102617,1234,http://quicloud.com/,http://google.com/,198.211.110.9
1391101111,1234,http://somewhere.com/,http://google.com/,197.211.110.9
1391002637,1234,http://somwhere-else.com/,http://yahoo.com/,196.211.110.9
1391002617,1234,http://quicloud.com/,http://bing.com/,198.211.110.9
// start up hive
hive
-- Create the page_view database.
-- NOTE: no USE statement follows, so the tables below are created in whatever
-- database is current (the default one unless you switch), not in page_view.
CREATE DATABASE page_view;

-- This is an external table: Hive reads the files already sitting at LOCATION.
-- NOTE: if using the AWS-backed 'cloudera manager' stack, change
-- '/user/cloudera/' to '/user/ubuntu/'.
-- Columns follow the comma-separated sample rows: view time (presumably Unix
-- epoch seconds), user id, page URL, referrer URL, client IP.
CREATE EXTERNAL TABLE page_view_external (
    viewTime     INT,
    userid       BIGINT,
    page_url     STRING,
    referrer_url STRING,
    ip           STRING
)
COMMENT 'This is the EXTERNAL page view table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/cloudera/input-data/page_view/';
-- Show the column list, then the full metadata, for the external table.
DESCRIBE page_view_external;
DESCRIBE EXTENDED page_view_external;
-- Notice tableType (EXTERNAL_TABLE) in the extended output.
-- This command returns quickly (probably seconds). Why?
-- With no filtering (such as a WHERE clause) and every row returned, no
-- MapReduce (MR) job is run.
SELECT *
FROM page_view_external;
-- This filtered query, however, takes several seconds.
-- It keeps only the rows whose page_url contains "quicloud".
SELECT *
FROM page_view_external
WHERE page_url LIKE '%quicloud%';
-- Distinct page URLs containing "quicloud".
-- The LIKE pattern must stay on one line: a line break inside the quotes
-- becomes part of the pattern and the query then matches nothing.
-- DISTINCT is a SELECT modifier, not a function, so no parentheses are needed.
SELECT DISTINCT page_url
FROM page_view_external
WHERE page_url LIKE '%quicloud%';
-- This is an internal table: no LOCATION is given, so Hive keeps the data
-- under its own warehouse directory.
-- Same column layout and ',' field delimiter as page_view_external.
CREATE TABLE page_view_internal (
    viewTime     INT,
    userid       BIGINT,
    page_url     STRING,
    referrer_url STRING,
    ip           STRING
)
COMMENT 'This is the INTERNAL page view table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
-- Load the page_view data from the local (non-HDFS) filesystem into the
-- internal table.
LOAD DATA LOCAL INPATH '/home/cloudera/code-and-data/data/page_view'
INTO TABLE page_view_internal;
// What happened to our "local" data? In another terminal do:
ls -lat ~/code-and-data/data/page_view/
// still there
-- Validate that the data made it into Hive.
SELECT *
FROM page_view_internal;
// in another terminal, let's look for where the data lives
hadoop fs -ls /user/hive/warehouse/page_view_internal
// where does our "external" (page_view_external) table's data live? this should work, right?
hadoop fs -ls /user/hive/warehouse/
// WRONG! This data is EXTERNAL and lives in the LOCATION that we specified in the CREATE EXTERNAL TABLE command
hadoop fs -ls /user/cloudera/input-data/page_view/
-- Show the full metadata for the internal table; compare the tableType
-- reported here with the one shown for page_view_external.
DESCRIBE EXTENDED page_view_internal;