Data Analysis Programming: SQL, Python, or SPL?


Talk is cheap. Let's look at the code.

1. Counting user sessions

User behavior table

userid   action_type   action_time
U1059    login         2023-12-01 18:00:10
U1092    login         2023-12-01 18:00:17
U1069    login         2023-12-01 18:00:22

A session is considered ended after 10 minutes without any action, or 5 minutes after an exit with no further login. Count the number of sessions for each user.

SPL

    A
1   =file("session_data.csv").import@tc()
2   =A1.group(userid;~.group@i((action_type[-1]=="exit"&&interval@s(action_time[-1],action_time)>300)||(interval@s(action_time[-1],action_time)>600)).len():session_num)

SQL

sql">WITH login_data AS (SELECT userid, action_type, action_time,LAG(action_time) OVER (PARTITION BY userid ORDER BY action_time) AS prev_time,LAG(action_type) OVER (PARTITION BY userid ORDER BY action_time) AS prev_actionFROM session_data)
SELECT userid, COUNT(*) AS session_count
FROM (SELECT userid, action_type, action_time, prev_time, prev_action,CASEWHEN prev_time IS NULL OR (action_time - prev_time) > 600OR (prev_action = 'exit' AND (action_time - prev_time) > 300)THEN 1ELSE 0END AS is_new_sessionFROM login_data)
WHERE is_new_session = 1
GROUP BY userid;

Python

python">login_data = pd.read_csv("session_data.csv")
login_data['action_time'] = pd.to_datetime(login_data['action_time'])
grouped = login_data.groupby("userid")
session_count = {}
for uid, sub_df in grouped:session_count[uid] = 0start_index = 0for i in range(1, len(sub_df)):current = sub_df.iloc[i]last = sub_df.iloc[start_index]last_action = last['action_type']if (current["action_time"] - last["action_time"]).seconds > 600 or \(last_action=="exit" and (current["action_time"] - last["action_time"]).seconds > 300):session_count[uid] += 1start_index = isession_count[uid] += 1
session_cnt = pd.DataFrame(list(session_count.items()), columns=['UID', 'session_count'])
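
For comparison, the same session-splitting rule can also be written without an explicit loop. The following is only a minimal vectorized pandas sketch under the same CSV layout and column names; the result name session_cnt2 is just for illustration.

import pandas as pd

# sketch: vectorized variant of the loop above, same columns assumed
d = pd.read_csv("session_data.csv")
d['action_time'] = pd.to_datetime(d['action_time'])
d = d.sort_values(['userid', 'action_time'])
gap = d.groupby('userid')['action_time'].diff().dt.total_seconds()
prev_action = d.groupby('userid')['action_type'].shift()
# a row opens a new session if it is the user's first action, follows a gap
# of more than 10 minutes, or follows an exit by more than 5 minutes
new_session = gap.isna() | (gap > 600) | ((prev_action == 'exit') & (gap > 300))
session_cnt2 = new_session.groupby(d['userid']).sum().rename('session_count').reset_index()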

2. Players who scored 3 consecutive times within 1 minute

Game score table

team   player   play_time             score
A      A3       2023-12-31 09:00:09   2
B      B1       2023-12-31 09:00:24   3
A      A5       2023-12-31 09:00:57   2

SPL

    A
1   =file("ball_game.csv").import@tc()
2   =A1.group@o(player).select(~.len()>2&&(~.pselect(#>2&&interval@s(play_time[-2],play_time)<60))).(player)

A2: group@o(). The @o option groups records in order: consecutive records of the same player go into one group, and a new group starts whenever the player changes. The expression then selects the groups with 3 or more consecutive scores in which some run of three consecutive scores spans less than 60 seconds, and finally takes the player from each selected group.
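
The ordered grouping that @o performs is similar in spirit to Python's itertools.groupby, which also splits a sequence into runs of adjacent equal keys. A minimal sketch with a hypothetical list of scoring players:

from itertools import groupby

# hypothetical sequence of players, in scoring order
scorers = ["A3", "A3", "B1", "A3", "A3", "A3"]
runs = [(p, len(list(g))) for p, g in groupby(scorers)]
print(runs)   # [('A3', 2), ('B1', 1), ('A3', 3)]: adjacent identical players form one group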

SQL

sql">WITH numbered_scores AS (SELECT team, player, play_time, score,ROW_NUMBER() OVER (ORDER BY play_time) AS rnFROM ball_game)
SELECT DISTINCT s1.player
FROM numbered_scores s1JOIN numbered_scores s2 ON s1.player = s2.player AND s1.rn = s2.rn - 1JOIN numbered_scores s3 ON s1.player = s3.player AND s1.rn = s3.rn - 2
WHERE (s3.play_time - s1.play_time) < 60 ;

Python

python">df = pd.read_csv("ball_game.csv")
df["play_time"] = pd.to_datetime(df["play_time"])
result_players = []
player = None
start_index = 0
consecutive_scores = 0
for i in range(len(df)-2):current = df.iloc[i]if player != current["player"]:player = current["player"]consecutive_scores = 1else:consecutive_scores += 1last2 = df.iloc[i-2] if i >=2 else Noneif consecutive_scores >= 3 and (current['play_time'] - last2['play_time']).seconds < 60:result_players.append(player)
result_players = list(set(result_players))
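
The same check can also be expressed with vectorized pandas operations. A minimal sketch, assuming the same ball_game.csv layout; run_id, pos_in_run and players are illustrative names only.

import pandas as pd

# sketch: vectorized variant of the loop above, same columns assumed
d = pd.read_csv("ball_game.csv")
d['play_time'] = pd.to_datetime(d['play_time'])
run_id = (d['player'] != d['player'].shift()).cumsum()     # one id per run of consecutive scores by a player
pos_in_run = d.groupby(run_id).cumcount()                  # position of the score within its run
span = (d['play_time'] - d['play_time'].shift(2)).dt.total_seconds()
# at least the 3rd score of a run, and less than 60 s after the 1st of the three
hit = (pos_in_run >= 2) & (span < 60)
players = d.loc[hit, 'player'].unique().tolist()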

3. Number of users active on three consecutive days within each 7-day window

User login table

id   userid   ts
1    466      2017-01-07 18:24:55
2    458      2017-01-07 18:25:18
3    458      2017-01-07 18:26:21

SPL

    A
1   =file("login_data.csv").import@tc()
2   =periods(date(A1.ts),date(A1.m(-1).ts))
3   =A1.group(userid).(~.align(A2,date(ts)).(if(#<7,null,(cnt=~[-6:0].group@i(!~).max(count(~)),if(cnt>=3,1,0)))))
4   =msum(A3).~.new(A2(#):dt,int(~):cont3_num).to(7,)

SQL

sql">WITH all_dates AS (SELECT DISTINCT TRUNC(ts) AS login_dateFROM login_data),
user_login_counts AS (SELECT userid, TRUNC(ts) AS login_date, (CASE WHEN COUNT(*)>=1 THEN 1 ELSE 0 END) AS login_countFROM login_dataGROUP BY userid, TRUNC(ts)),
whether_login AS (SELECT u.userid, ad.login_date, NVL(ulc.login_count, 0) AS login_countFROM all_dates adCROSS JOIN (SELECT DISTINCT useridFROM login_data) uLEFT JOIN user_login_counts ulcON u.userid = ulc.useridAND ad.login_date = ulc.login_dateORDER BY u.userid, ad.login_date),
whether_login_rn AS (SELECT userid,login_date,login_count,ROWNUM AS rn FROM whether_login),
whether_eq AS(SELECT userid,login_date,login_count,rn,(CASE WHEN LAG(login_count,1) OVER (ORDER BY rn)= login_count AND login_count =1 AND LAG(userid,1) OVER (ORDER BY rn)=userid THEN 0 ELSE 1 END) AS wether_e FROM whether_login_rn
),
numbered_sequence AS (SELECT userid,login_date,login_count,rn, wether_e,SUM(wether_e) OVER (ORDER BY rn) AS labFROM whether_eq),
consecutive_logins_num AS (SELECT userid,login_date,login_count,rn, wether_e,lab,(SELECT (CASE WHEN max(COUNT(*))<3 THEN 0 ELSE 1 END)FROM numbered_sequence bWHERE b.rn BETWEEN a.rn - 6 AND a.rnAND b.userid=a.useridGROUP BY b. lab) AS cntFROM numbered_sequence a)
SELECT login_date,SUM(cnt) AS cont3_num
FROM consecutive_logins_num
WHERE login_date>=(SELECT MIN(login_date) FROM all_dates)+6
GROUP BY login_date
ORDER BY login_date;

Python

python">df = pd.read_csv("login_data.csv")
df["ts"] = pd.to_datetime(df["ts"]).dt.date
grouped = df.groupby("userid")
aligned_dates = pd.date_range(start=df["ts"].min(), end=df["ts"].max(), freq='D')
user_date_wether_con3days = []
for uid, group in grouped:group = group.drop_duplicates('ts')aligned_group = group.set_index("ts").reindex(aligned_dates)consecutive_logins = aligned_group.rolling(window=7)n = 0date_wether_con3days = []for r in consecutive_logins:n += 1if n<7:continueelse:ds = r['userid'].isna().cumsum()cont_login_times = r.groupby(ds).userid.count().max()wether_cont3days = 1 if cont_login_times>=3 else 0date_wether_con3days.append(wether_cont3days)user_date_wether_con3days.append(date_wether_con3days)
arr = np.array(user_date_wether_con3days)
day7_cont3num = np.sum(arr,axis=0)
result = pd.DataFrame({'dt':aligned_dates[6:],'cont3_num':day7_cont3num})
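
An alternative, loop-free way to get the same counts is to build a user-by-day 0/1 matrix and run a 7-day rolling check over each row. This is only a rough sketch under the same column names; has_3day_run and result3 are illustrative names, and the per-row apply is written for clarity rather than speed.

import pandas as pd

# sketch: user x day login-flag matrix, then a 7-day rolling check per user
d = pd.read_csv("login_data.csv")
d['ts'] = pd.to_datetime(d['ts']).dt.normalize()
days = pd.date_range(d['ts'].min(), d['ts'].max(), freq='D')
flags = (d.assign(v=1)
           .pivot_table(index='userid', columns='ts', values='v', aggfunc='max')
           .reindex(columns=days)
           .fillna(0).astype(int))

def has_3day_run(window):
    # longest run of consecutive login days within one 7-day window
    best = run = 0
    for v in window:
        run = run + 1 if v else 0
        best = max(best, run)
    return 1.0 if best >= 3 else 0.0

per_user = flags.apply(lambda row: row.rolling(7).apply(has_3day_run, raw=True), axis=1)
result3 = per_user.sum(axis=0).iloc[6:].rename('cont3_num')   # one value per day from day 7 onward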

4. Next-day retention rate of each day's new users

User login table

id   userid   ts
1    466      2017-01-07 18:24:55
2    458      2017-01-07 18:25:18
3    458      2017-01-07 18:26:21

SPL

    A
1   =file("login_data.csv").import@tc()
2   =A1.group(userid;fst=date(ts):fst_login,~.(date(ts)).pos(fst+1)>0:wether_sec_login)
3   =A2.groups(fst_login+1:dt;count(wether_sec_login)/count(1):ret_rate)

A2: group by user, record each user's first login date, and check whether the user logged in again the next day.
A3: group by the day after the first login and compute the next-day retention rate.

SQL

sql">WITH first_login AS (SELECT userid, MIN(TRUNC(ts)) AS first_login_dateFROM login_dataGROUP BY userid),
next_day_login AS (SELECT DISTINCT(fl.userid), fl.first_login_date, TRUNC(ld.ts) AS next_day_login_dateFROM first_login flLEFT JOIN login_data ld ON fl.userid = ld.useridWHERE TRUNC(ld.ts) = fl.first_login_date + 1),
day_new_users AS(SELECT first_login_date,COUNT(*) AS new_user_numFROM first_loginGROUP BY first_login_date),
next_new_users AS(SELECT next_day_login_date, COUNT(*) AS next_user_numFROM next_day_loginGROUP BY next_day_login_date),
all_date AS(SELECT DISTINCT(TRUNC(ts)) AS login_dateFROM login_data)
SELECT all_date.login_date+1 AS dt,dn. new_user_num,nn. next_user_num,(CASE WHEN nn. next_day_login_date IS NULL THEN 0 ELSE nn.next_user_num END)/dn.new_user_num AS ret_rate
FROM all_dateJOIN day_new_users dn ON all_date.login_date=dn.first_login_dateLEFT JOIN next_new_users nn ON dn.first_login_date+1=nn. next_day_login_date
ORDER BY all_date.login_date;

Python

python">df = pd.read_csv("login_data.csv")
df["ts"] = pd.to_datetime(df["ts"]).dt.date
gp = df.groupby('userid')
row = []
for uid,g in gp:fst_dt = g.iloc[0].tssec_dt = fst_dt + pd.Timedelta(days=1)all_dt = g.ts.valueswether_sec_login = sec_dt in all_dtrow.append([uid,fst_dt,sec_dt,wether_sec_login])
user_wether_ret_df = pd.DataFrame(row,columns=['userid','fst_dt','sec_dt','wether_sec_login'])
result = user_wether_ret_df.groupby('sec_dt').apply(lambda x:x['wether_sec_login'].sum()/len(x))
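
A vectorized sketch of the same retention calculation, assuming the same login_data.csv layout; first, login_days, retained and ret_rate are illustrative names only.

import pandas as pd

# sketch: vectorized variant of the loop above, same columns assumed
d = pd.read_csv("login_data.csv")
d['ts'] = pd.to_datetime(d['ts']).dt.normalize()            # login timestamps truncated to the day
first = d.groupby('userid')['ts'].min()                     # each user's first login day
login_days = set(zip(d['userid'], d['ts']))                 # (user, day) pairs with at least one login
sec_dt = first + pd.Timedelta(days=1)
retained = pd.Series([(u, t) in login_days for u, t in sec_dt.items()], index=first.index)
ret_rate = retained.groupby(sec_dt).mean()                  # next-day retention per first-login cohort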

5. The day's gain when the closing price is higher than in the 5 days before and after

Stock price table

STOCKID   DATE         CLOSING
62        2015-01-05   8.91
62        2015-01-06   8.31
62        2015-01-07   7.6

SPL

    A
1   =file("STOCK.csv").import@tc()
2   =lth=A1.len(),A1.pselect@a(#>4&&#<=A1.len()-4&&CLOSING>max(CLOSING[-4:-1])&&CLOSING>max(CLOSING[1:4]))
3   =A1.calc(A2,CLOSING/CLOSING[-1]-1)

SQL

sql">SELECT closing/closing_pre-1 AS raise
FROM(SELECT dt, closing, ROWNUM AS rn,MAX(closing) OVER (ORDER BY dt ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS max_pre,MAX(closing) OVER (ORDER BY dt ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING) AS max_suf,LAG(closing,1) OVER (ORDER BY dt) AS closing_preFROM stock)
WHERE rn>5 AND rn<=(select count(*) FROM stock)-5AND CLOSING>max_pre  AND CLOSING>max_suf;

Python

python">stock_price_df = pd.read_csv('STOCK.csv')
price_increase_list = []
for i in range(5, len(stock_price_df)-5):if stock_price_df['CLOSING'][i] > max(stock_price_df['CLOSING'][i-5:i]) and \stock_price_df['CLOSING'][i] > max(stock_price_df['CLOSING'][i+1:i+6]):price_increase = stock_price_df['CLOSING'][i] / stock_price_df['CLOSING'][i-1]-1price_increase_list.append(price_increase)
result = price_increase_list
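
The same filter can be written with rolling windows instead of a loop. A minimal sketch, assuming the same STOCK.csv layout; max_prev5, max_next5 and gains are illustrative names.

import pandas as pd

# sketch: rolling-window variant of the loop above, same columns assumed
s = pd.read_csv('STOCK.csv')
close = s['CLOSING']
max_prev5 = close.rolling(5).max().shift(1)        # highest close of the 5 preceding days
max_next5 = close.rolling(5).max().shift(-5)       # highest close of the 5 following days
mask = (close > max_prev5) & (close > max_next5)   # NaN at the edges compares False, so boundary days drop out
gains = (close / close.shift(1) - 1)[mask].tolist()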

Whether it is really that convenient, you will only know after downloading and trying it: free download and trial.

