SQL stuffs.
Spark SQL guide is available from Spark documentation.
Here are some good external references for learning and practicing SQL, e.g., websites and online coding exercises.
The following LeetCode problems are order by their difficulties in which hard problems are discussed first and medium problems are then presented. However, I will not list all easy problems :laughing:
between...and
.round
for rounding.count(*)
followed by group by
for line counting.case when <> then <> else <> end
to compute and avoid the null
casesselect T.Request_at,
round(sum(case when T.Status like 'can%' then 1 else 0 end)/count(*),2) as ratio
from Trips as T join Users as U on T.Client_Id = U.Users_Id
where U.banned = 'No' and U.Role = 'client' and T.Request_at between '2013-10-01' and '2013-10-03'
group by T.Request_at
count(distince [colname])
.count(distinct [colname])...group by...
.count([colname])...group by...
.select Department.Name,tmp.name,tmp.s1
from
(select e1.Departmentid id, e1.Name name, e1.Salary s1, count(distinct e2.Salary) rk
from Employee e1 join Employee e2 on e1.DepartmentId = e2.DepartmentId
where e1.Salary<=e2.Salary
group by e1.Name) tmp
join Department on tmp.id = Department.Id
where tmp.rk<=3
set n=n-1;
.limit 1 offset N
.CREATE FUNCTION getNthHighestSalary(N INT) RETURNS INT
BEGIN
set n=N-1;
RETURN (
select tmp.s1
from
(select distinct e1.Salary s1, count( e2.Salary) rk
from Employee e1 join Employee e2
where e1.Salary<=e2.Salary
group by e1.Id)tmp
order by tmp.rk
limit 1 offset n
);
END
distinct
in count()
.distinct
in count()
.group by
and order by
.select s1.Score, count(distinct s2.Score) as rk
from Scores s1 join Scores s2
where s1.Score<=s2.Score
group by s1.Id
order by rk
select distinct l1.Num
from Logs l1 join Logs l2 on l1.Id+1 = l2.Id
join Logs l3 on l2.Id+1 = l3.Id
where l1.Num = l2.Num and l2.Num=l3.Num
group by
the corresponding column of the first table.select d.Name, tmp.Name, tmp.Salary
from Department d join
(select e1.Name, e1.Salary, count(distinct e2.Salary) as ct, e1.DepartmentId
from Employee e1 join Employee e2
where e1.DepartmentId = e2.DepartmentId and e1.Salary<=e2.Salary
group by e1.Id) tmp
on tmp.DepartmentId = d.Id
where tmp.ct = 1
where
clause.min()
or max()
functions followed by group by
delete from Person
where Person.Id not in
(select * from (select min(Id) from Person group by Email)tmp)