-
Notifications
You must be signed in to change notification settings - Fork 72
/
Hadoop(Hive) - NYC Yellow Taxi Case Study.txt
91 lines (58 loc) · 2.36 KB
/
Hadoop(Hive) - NYC Yellow Taxi Case Study.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
NYC Yellow Taxi Case Study using HiveQL
Tasks:
1. Create a table named taxidata.
Create database hive;
Use hive;
CREATE TABLE IF NOT EXISTS taxidata
(vendor_id string, pickup_datetime string, dropoff_datetime string, passenger_count
int, trip_distance DECIMAL(9,6), pickup_longitude DECIMAL(9,6), pickup_latitude
DECIMAL(9,6), rate_code int, store_and_fwd_flag string, dropoff_longitude
DECIMAL(9,6), dropoff_latitude DECIMAL(9,6), payment_type string, fare_amount
DECIMAL(9,6), extra DECIMAL(9,6), mta_tax DECIMAL(9,6), tip_amount
DECIMAL(9,6), tolls_amount DECIMAL(9,6), total_amount DECIMAL(9,6),
trip_time_in_secs int )
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED as TEXTFILE TBLPROPERTIES ("skip.header.line.count"="1");
2. Load data from the csv file. (yellow_tripdata_2015-01-
06.csv)
LOAD DATA INPATH '/user/hive OVERWRITE INTO TABLE taxidata;
3. Run some basic queries to check the data is loaded
properly.
Select * from taxidata;
4. Run the queries required to answer the following
questions.
Problem Statement:
* What is the total Number of trips (equal to number of rows)?
Select count (*) from taxidata;
* What is the total revenue generated by all the trips?
Select sum(total_amount) as total_revenue from taxidata;
* What fraction of the total is paid for tolls?
Select sum(tolls_amount)/sum(total_amount) as toll_pct from taxidata;
* What fraction of it is driver tips?
Select sum(tip_amount)/sum(total_amount) as tip_pct from taxidata;
* What is the average trip amount?
Select avg(total_amount) as avg_tripamount from taxidata;
* For each payment type, display the following details:
i. Average fare generated
ii. Average tip
iii. Average tax
select payment_type,
avg(fare_amount) as average_fare,
avg(tip_amount) as average_tip,
avg(mta_tax) as average_tax,
from taxidata
group by payment_type;
* On an average which hour of the day generates the highest revenue?
select h24 as hour,
avg(total_amount) as avg_revenue
from (select hour(pickup_datetime) as h24,
total_amount
from taxidata) ff
group by h24
order by avg_revenue desc;
* What is the average distance of the trips?
select
avg(trip_distance) as avg_distance
from trips4;
* How many different payment types are used?
select distinct payment_type from taxidata;