爬取七麦(Qimai)网应用下载量数据
通过python写脚本抓取七麦数据上Android应用在各大应用平台的下载量。
获取js代码
网络上有很多方法,不展开,最终获取信息如下:
function C(a, n) {
a = a["split"]("");
for (var t = a["length"], e = n["length"], r = "charCodeAt", i = 0; i < t; i++) a[i] = m(a[i][r](0) ^ n[(i + 10) % e][r](0));
return a["join"]("")
}
function m(n) {
var t = "fromCharCode";
return String[t](n)
}
//上面是分析f(k)函数
//下面是f(e)函数
//函数v(n)就是f(e)函数的结果,m函数和上面的函数一样,只需要分析其中的n_fun函数的实现
function v(n) {
return n_fun(encodeURIComponent(n)["replace"](/%([0-9A-F]{2})/g,
function (a, n) {
return m("0x" + n)
}))
}
function n_fun(t) {
var n;
n = e_from(t.toString(), "binary") ;
return q_fromByteArray(n) // 这一处的代码相当于 n.toString("base64")
}
function e_from(t_str, b) {
var r = t_str.length;
t = new Uint8Array(r);
var i = t_write(t, t_str, b, r);
return t
}
function t_write(t, e, b, r) {
return K(W(e), t, 0, r)
}
function K(t, e, n, r) {
for (var j = 0; j < r && !(j + n >= e.length || j >= t.length); ++j) e[j + n] = t[j];
return j
}
function W(t) {
for (var e = [], n = 0; n < t.length; ++n) e.push(255 & t.charCodeAt(n));
return e
}
l = "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,0,1,2,3,4,5,6,7,8,9,+,/"
l = l.split(",")
function q_fromByteArray(t) {
for (var e, n = t.length,
r = n % 3,
i = "",
o = [], a = 16383, u = 0, c = n - r; u < c; u += a) o.push(s(t, u, u + a > c ? c : u + a));
return 1 === r ? (e = t[n - 1], i += l[e >> 2], i += l[e << 4 & 63], i += "==") : 2 === r && (e = (t[n - 2] << 8) + t[n - 1], i += l[e >> 10], i += l[e >> 4 & 63], i += l[e << 2 & 63], i += "="),
o.push(i),
o.join("")
}
function s(t, e, n) {
for (var r, i = [], o = e; o < n; o += 3) r = (t[o] << 16 & 16711680) + (t[o + 1] << 8 & 65280) + (255 & t[o + 2]),
i.push(a(r));
return i.join("")
}
function a(t) {
return l[t >> 18 & 63] + l[t >> 12 & 63] + l[t >> 6 & 63] + l[63 & t]
}
function get0analysis(synct, params) {
var g = new Date() - 1000 * synct;
var e = new Date() - g - 1515125653845;
var analy = [];
var palist = [];
for (var key in params) {
palist.push(params[key])
}
var mm = palist["sort"]()["join"]("");
var mmm = v(mm); //参数mm先执行f(e)函数
var m_str1 = mmm + '@#/rank/indexPlus/brand_id/1@#57313212470@#1';
var m_str0 = mmm + '@#/rank/indexPlus/brand_id/0@#57313212470@#0';
var m_str2 = mmm + '@#/rank/indexPlus/brand_id/2@#57313212470@#2';
var b_str = "00000008d78d46a";
var r2 = v(C(m_str2, b_str));
var r0 = v(C(m_str0, b_str));
var r1 = v(C(m_str1, b_str)) ;
analy.push(r0, r1, r2);
return analy
}
python下载脚本
# -*- coding:utf-8 -*-
import requests, execjs,json
import random
import time, os
import pandas as pd
user_agent = [
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
headers = {
"User-Agent": random.choice(user_agent),
'Origin': 'https://www.qimai.cn',
'Referer': 'https://www.qimai.cn/rank',
"Accept": "application/json,text/plain,*/*",
}
params = {'brand': 'all',
'country': 'cn',
'device': 'iphone',
'genre': '5000',
'date': '2020-8-31',
'page': 2
}
TRY_TIMES = 10
def getAnalysis():
resp = requests.get('https://www.qimai.cn/rank', headers=headers, verify=False)
t = resp.cookies.get_dict()
synct = t.get('synct')#时间
print("current time is : " + synct)
ana = getanaly(synct)
return ana
def getanaly(synct):
js_path = "qimai.js"
try:
new_pwd = "dummy"
with open(js_path, 'r',encoding='utf8') as f:
js_content = f.read()
ctx = execjs.compile(js_content)
new_pwd = ctx.call("get0analysis", synct, params)
except Exception as ex:
print("Exception when run js file....., will return dummy pwd")
finally:
print("getanaly result is: ", new_pwd)
return new_pwd
def check_ana_valid(ana):
params['analysis'] = ana[0]
print("params['analysis'] = " + params['analysis'] )
# params['analysis'] = ana
# print("params['analysis'] = " + params['analysis'])
url = 'https://api.qimai.cn/userRequest/index?'
# print(url)
res = requests.get(url=url,headers=headers,params=params)
# print("check_ana_valid res.text: ", res.text)
try:
resjson = json.loads(res.text)
print("check ana result: " , resjson)
return(resjson)
except Exception as ex:
return""
def check_specifykey_specifymarket_exist(ana, market, searchkey):
params['analysis'] = ana[1]
params['market'] = market #'6'
params['search'] = searchkey # '每团‘
url = 'https://api.qimai.cn/search/checkHasBundleId?'
print(url)
res = requests.get(url=url,headers=headers,params=params)
print("check_specifykey_specifymarket_exist res.txt: "+ res.text)
try:
resjson = json.loads(res.text)
print(resjson)
return (resjson)
except Exception as ex:
return ""
# 查询pagenum页信息
def search_specifykey_specifymarket(ana, market, searchkey, outfilename, keys, pagenum):
params['analysis'] = ana[1]
params['market'] = market #'6'
params['search'] = searchkey # '每团‘
for i in range(1,pagenum):
params['page'] = i
url = 'https://api.qimai.cn/search/android?'
print(url)
print("params: %s" % str(params))
res = requests.get(url=url,headers=headers,params=params)
print("search_specifykey_specifymarket res.txt: "+ res.text)
try:
resjson = json.loads(res.text)
print(resjson)
if str(resjson).strip() == '' or str(resjson).strip().find("10602") != -1:
ana = getAna(TRY_TIMES)
search_specifykey_specifymarket(ana, market, searchkey, outfilename, keys, pagenum)
else:
if not resjson.get('appList') is None:
print(resjson.get('appList')[0])
save_json_to_file(outfilename, resjson, keys)
except Exception as ex:
return ""
# save json data to file
# keys is filter, if keys value exist, don't save
def save_json_to_file(outfilename, jsondata, keys):
with open(outfilename, 'a+', encoding='UTF-8') as f:
f.seek(0)
lines = f.read().splitlines()
EXIST = False
newline = str(jsondata).strip()
try:
newlinedict = eval(newline)
except Exception as ex:
newlinedict = {}
print("newline is :" + newline)
if len(lines) == 0:
f.writelines(newline + '\n')
else:
if not EXIST:
for line in lines:
try:
linedict = eval(line)
except Exception as ex:
continue
for key in keys:
cont = linedict.get(key)
cont_newline = newlinedict.get(key)
if not cont_newline == cont:
EXIST = False
break
EXIST = True
if not EXIST:
f.writelines(newline + '\n')
def download(searchkey, markets, capture_page_numbers):
ana = getAna(TRY_TIMES)
# 共10个android应用市场
# markets = {'360': '1', '百度': '2', '应用宝': '3', '小米': '4', '豌豆荚': '5', '华为': '6', '魅族': '7', 'VIVO': '8', 'OPPO': '9',
# 'gogoleplay': '10'} #共10个android应用市场
for market_name in markets.keys():
market_id = markets.get(market_name)
# jsondata2 = check_specifykey_specifymarket_exist(ana, market, searchkey)
# save_json_to_file("test.txt", jsondata2, ["code", "msg"])
outfilename = searchkey + "_market_" + market_name + ".txt"
search_specifykey_specifymarket(ana, market_id, searchkey, outfilename, [], capture_page_numbers)
def getAna(trytimes):
ana = getAnalysis()
jsondata1 = check_ana_valid(ana)
# newline = str(jsondata1).strip()
while str(jsondata1).strip() == '' or str(jsondata1).strip().find("10602") != -1:
if str(jsondata1).strip().find('Access Error') != -1:
print("Current try time is No.: ", str(TRY_TIMES - trytimes), ". Please wait 10 secs")
time.sleep(10)
elif str(jsondata1).strip().find('请半小时后重试') != -1:
print("Current try time is No.: ", str(TRY_TIMES - trytimes), ". Please wait 30 mins")
time.sleep(1850)
ana = getAnalysis()
jsondata1 = check_ana_valid(ana)
trytimes -= 1
if trytimes == 0:
break
save_json_to_file("check_ana_valid.txt", jsondata1, ["code", "msg"])
return ana
def json_to_excel(text):
df = pd.DataFrame() # 最后转换得到的结果
df1 = pd.DataFrame([text])
df = df.append(df1)
df.to_excel('data.xlsx', sheet_name='Data', startcol=0, index=False)
def json_file_cont_to_excel_special(inputdata, outputfile):
print("will generate {outputfile} base on {inputdata}".format(outputfile=outputfile,inputdata=inputdata))
data = [] # 用于存储每一行的数据
with open(inputdata, 'r', encoding='UTF-8') as fr:
for line in fr:
data.append(line)
df = pd.DataFrame() # 最后转换得到的结果
for line in data:
#每行的数据转为json格式,再转为dataframe格式
line_json = eval(line)
applist_cont_list = line_json.get('appList')
for i in applist_cont_list:
info = {}
if 'appInfo' in i.keys():
info.update(i.get('appInfo'))
if 'company' in i.keys():
info.update(i.get('company'))
if 'rankInfo' in i.keys():
info.update(i.get('rankInfo'))
df1 = pd.DataFrame(info,index=[0])
# df1 = pd.DataFrame.from_dict(info,orient='index').T
df = df.append(df1)
sheet_name = inputdata[:inputdata.find('.')]
df.to_excel(outputfile, sheet_name=sheet_name, startcol=0, index=False)
print("generated {outputfile} base on {inputdata} finish".format(outputfile =outputfile,inputdata=inputdata))
def convert_result_to_one_excel(excelfilename):
list_file = os.listdir('./')
write = pd.ExcelWriter(excelfilename)
for contentfile in list_file:
name = os.path.splitext(contentfile)[0]
suffix = os.path.splitext(contentfile)[1]
if suffix == '.txt' and name.find('_market') != -1:
json_file_cont_to_excel_special(contentfile, write)
write.save()
def convert_result_to_multi_excels():
list_file = os.listdir('./')
for contentfile in list_file:
name = os.path.splitext(contentfile)[0]
suffix = os.path.splitext(contentfile)[1]
if suffix == '.txt' and name.find('_market') != -1:
output_excel_name = name + '.xls'
json_file_cont_to_excel_special(contentfile, output_excel_name)
if __name__ == "__main__":
searchkeys = ['拜佛', '佛教']
markets = {'360': '1', '百度': '2', '应用宝': '3', '小米': '4', '豌豆荚': '5', '华为': '6', '魅族': '7', 'VIVO': '8', 'OPPO': '9',
'gogoleplay': '10'} # 共10个android应用市场
capture_page_numbers = 5
for searchkey in searchkeys:
download(searchkey, markets, capture_page_numbers)
out_excel_filename = "android.xls"
convert_result_to_one_excel(out_excel_filename)
# convert_result_to_multi_excels()
运行日志信息及最终结果展示
日志:
结果文件
txt结果内容示例
{'code': 10000, 'msg': '成功', 'isActualAndroid': 0, 'wechatList': [], 'isSearch': True, 'totalNum': '20', 'maxPage': 1, 'offlineAppInfo': False, 'appList': [{'appInfo': {'appId': '164404', 'appName': '拜佛', 'icon': 'https://static-cdn.qimai.cn/pic/view/type/android/source/aHR0cDovL3AyLnFoaW1nLmNvbS90MDE5ZjU2NTY3YWNkZmZjMmFmLndlYnA=', 'publisher': '深圳市功德文化有限公司', 'comment_score': 1, 'comment_count': 260, 'version_time': '2020-06-23', 'app_download_num': '196454'}, 'genre': '常用工具', 'isMyApp': 0, 'company': {'id': '11863', 'name': '深圳市聚英杰科技有限公司'}, 'rankInfo': {'ranking': 0, 'change': 0, 'genre': '游戏'}}, {'appInfo': {'appId': '132294', 'appName': '怀恩菩提心', 'icon': 'https://static-cdn.qimai.cn/pic/view/type/android/source/aHR0cDovL3AyLnFoaW1nLmNvbS90MDE0NzVkNjRhMGRhM2M2Njc0LndlYnA=', 'publisher': '上海怀恩网络科技有限公司', 'comment_score': 1, 'comment_count': 664, 'version_time': '2019-12-11', 'app_download_num': '362645'}, 'genre': '常用工具', 'isMyApp': 0, 'company': {'id': '49880', 'name': '上海怀恩网络科技有限公司'}, 'rankInfo': {'ranking': 0, 'change': 0, 'genre': '游戏'}}, {'appInfo': {'appId': '132096', 'appName': '修行者', 'icon': 'https://static-cdn.qimai.cn/pic/view/type/android/source/aHR0cDovL3AyLnFoaW1nLmNvbS90MDE2OTUxNmM5YTZjNWVhNzdjLndlYnA=', 'publisher': '广东灵机文化传播有限公司', 'comment_score': 0.9, 'comment_count': 169, 'version_time': '2018-07-19', 'app_download_num': '693930'}, 'genre': '生活服务', 'isMyApp': 0, 'company': {'id': '863', 'name': '广东亿俊计算机系统有限公司'}, 'rankInfo': {'ranking': 0, 'change': 0, 'genre': '游戏'}}]}
excel结果内容示例
说明
- 下载后,会根据关键字、平台把结果存入多个不同的txt文件中
- 方便查看,最后会把txt格式转为excel格式:可以通过调整下面代码实现把结果存放到同一个excel或多个分开的excel
#存到同一个excel文件
convert_result_to_one_excel(out_excel_filename)
#存到多个分开的excel文件
convert_result_to_multi_excels()
- 经常会下载是获取不到信息,多等一段时间即可。同时发现,没有获取到正确的analysis亦可爬取到结果
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!