0%

工具笔记

爬百度图片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import requests
import re
import os


def get_page_url(url, param):
    """Fetch ``url`` with the given query parameters and return the body as text.

    :param url: address to request with HTTP GET.
    :param param: mapping passed to ``requests.get`` as ``params``.
    :return: the response body, decoded as UTF-8.
    """
    resp = requests.get(url, params=param)
    # Force UTF-8 rather than relying on the encoding requests would pick.
    resp.encoding = 'utf-8'
    body = resp.text
    return body


def parse_page(str):
    """Extract every ``middleURL`` value from a response body.

    :param str: response text to scan (the name shadows the builtin inside
        this function; kept because it is part of the signature).
    :return: list of captured URL strings, in order of appearance.
    """
    # Regex that matches image URLs: the shortest text between
    # '"middleURL":"' and the following '",'.
    return re.compile('"middleURL":"(.*?)",').findall(str)


def run(keyword, path, num):
    """Download up to ``num`` images matching ``keyword`` into ``path``.

    Result pages are requested from the Baidu image search endpoint with the
    ``pn`` offset advancing in steps of 30 (presumably the endpoint's page
    size). Each image URL found by ``parse_page`` is downloaded and written as
    ``<index>.jpg``, where the index counts successfully saved images.

    :param keyword: search term sent as the ``word`` query parameter.
    :param path: existing directory that receives the image files.
    :param num: number of images wanted; anything ``int()`` accepts.
    """
    url = "https://image.baidu.com/search/acjson"
    wanted = int(num)
    saved = 0
    for offset in range(0, wanted, 30):
        params = {"ipn": "rj", "tn": "resultjson_com", "word": keyword, "pn": str(offset)}
        html = get_page_url(url, params)
        lists = parse_page(html)
        print(lists)
        for item in lists:
            # A page can hold more URLs than are still needed; stop at the
            # requested count instead of saving the whole page.
            if saved >= wanted:
                return
            try:
                img_data = requests.get(item, timeout=10).content
            except requests.exceptions.RequestException:
                # Covers connection errors and timeouts alike, so one bad
                # image does not abort the remaining downloads.
                print('can not download')
                continue
            with open(os.path.join(path, str(saved) + ".jpg"), "wb") as f:
                f.write(img_data)
            saved += 1


def make_dir(keyword):
    """Ensure the download directory for ``keyword`` exists and return its path.

    The directory lives under the relative folder ``百度图片/``. When it already
    exists a notice is printed and nothing is created.

    :param keyword: sub-directory name appended to the base folder.
    :return: the relative path of the directory.
    """
    target = '百度图片/' + keyword
    if os.path.exists(target):
        print(target + '目录已存在')
    else:
        os.makedirs(target)
    return target


def main():
    """Prompt for a keyword and an image count, then download the images."""
    keyword = input("input keyword about images you want to download: ")
    num = input("input the number that you want: ")
    # The target directory is prepared before any download starts.
    run(keyword, make_dir(keyword), num)


# Run the interactive downloader only when executed as a script.
if __name__ == '__main__':
    main()

爬百度资讯文章

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import requests
import re
import os
from pyquery import PyQuery as pq


def get_page(url):
    """Fetch ``url`` with browser-like headers and return its body as text.

    :param url: address to request with HTTP GET.
    :return: the response body decoded as UTF-8 when the status code is 200,
        otherwise ``None``.
    :raises requests.exceptions.RequestException: propagated from
        ``requests.get``, including a timeout after 10 seconds.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    # Without a timeout requests may wait indefinitely on a stalled server;
    # 10 seconds matches the other requests.get calls in this file.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    if response.status_code == 200:
        return response.text
    return None

if __name__ == '__main__':
    # Interactive script: walks Baidu news result pages for a keyword and saves
    # each linked article (text plus images) into a directory named after the
    # article title.

    # Local standard-library import, used to percent-encode the keyword.
    from urllib.parse import quote

    keyword = input("input keyword about news that you want to download: ")
    numStart = input("input the start number that you want: ")
    numEnd = input("input the end number that you want: ")

    for page in range(int(numStart), int(numEnd)):
        # Encode the keyword so characters such as '&', '#' or spaces cannot
        # corrupt the query string. The pn offset advances by 10 per page.
        url = ('https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&medium=2&word='
               + quote(keyword) + '&pn=' + str(page * 10))
        print('\nnow the page is ' + str(page) + ' and the url is ' + url)
        html = get_page(url)
        if html is None:
            # get_page returns None for a non-200 response; there is nothing
            # to parse, so move on to the next result page.
            print('can not fetch this result page, skip it.')
            continue
        result = pq(html)
        entries = result('.result')
        for item in range(len(entries)):
            try:
                # Link lookup is inside the try so a malformed result entry
                # skips one article instead of stopping the script.
                article_url = pq(entries.eq(item).html())('a').attr('href')
                # Download the article.
                d = pq(url=article_url)
                title = d('.article-title').text()
                if not title:
                    # The title doubles as directory and file name.
                    raise ValueError('article title not found')
                print('now the article is ' + title)
                print('now the url is ' + article_url + '\n')
                if not os.path.exists(title):
                    os.mkdir(title)
                # Explicit UTF-8 keeps the text independent of the platform
                # default encoding; the with-block closes the file on errors.
                with open(title + '/' + title + '.txt', 'w', encoding='utf-8') as fobj:
                    fobj.write(title)
                    paragraphs = d('.bjh-p')
                    for i in range(len(paragraphs)):
                        fobj.write('\n' + paragraphs.eq(i).text())
                # Download the images.
                containers = d('.img-container')
                for imgIndex in range(len(containers)):
                    img_src = pq(containers.eq(imgIndex).html())('img').attr('src')
                    img_data = requests.get(img_src, timeout=10).content
                    with open(title + "/" + str(imgIndex) + ".jpg", "wb") as f:
                        f.write(img_data)
            except Exception as e:
                # Best effort: report the failure and continue with the next
                # article.
                print('there is a error when parsing this article, maybe it is not exist.')
                print('reason: ' + repr(e))

比较目录差异

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import java.security.MessageDigest
import java.util.stream.Collectors

/**
 * Command-line tool that reports how many files differ between two directory
 * trees, once by hashing files in-process and once by running {@code diff -rq}.
 *
 * Usage: {@code SimilarCalculate <dir1> <dir2>}
 */
class SimilarCalculate {
    /**
     * Entry point. Expects two existing directories as arguments and prints the
     * result of both comparison strategies, separated by a dashed line.
     */
    static void main(String[] args) {
        if (args == null || args.length < 2) {
            println("Usage: SimilarCalculate <dir1> <dir2>")
            return
        }
        String path1 = args[0]
        String path2 = args[1]
        // Both strategies walk the trees; fail early with a message instead of
        // an exception from deep inside the traversal.
        if (!new File(path1).isDirectory() || !new File(path2).isDirectory()) {
            println("Both arguments must be existing directories: $path1, $path2")
            return
        }
        compareByCode(path1, path2)
        println("--------------------------------------")
        compareByShell(path1, path2)
    }

    /**
     * Compares the two trees in both directions by file content and prints the
     * raw counters plus the summary produced by {@link #printResult}.
     */
    private static void compareByCode(String path1, String path2) {
        def f1 = new File(path1)
        def f2 = new File(path2)
        int diffCount1, nonCount1, diffCount2, nonCount2
        (diffCount1, nonCount1) = compareDir(f1, f2)
        (diffCount2, nonCount2) = compareDir(f2, f1)
        println("Value: $diffCount1, $nonCount1, $diffCount2, $nonCount2")
        // Files present on both sides are counted once (diffCount1); files
        // missing on either side are added from both directions.
        printResult(f1, diffCount1 + nonCount1 + nonCount2)
    }

    /**
     * Walks every file under {@code src} and looks for the same relative path
     * under {@code des}.
     *
     * @return a two-element list {@code [diffCount, nonCount]}: files whose
     *         content differs, and files missing from {@code des}. Returns
     *         {@code [0, 0]} when either argument is not a directory, so that
     *         callers can always destructure the result.
     */
    private static List<Integer> compareDir(File src, File des) {
        if (!src.isDirectory() || !des.isDirectory()) {
            return [0, 0]
        }
        int diffCount = 0
        int nonCount = 0
        src.eachFileRecurse {
            if (it.isFile()) {
                String subPath = src.relativePath(it)
                File desFile = new File(des, subPath)
                if (desFile.exists()) {
                    if (!fileSame(it, desFile)) {
                        diffCount++
                    }
                } else {
                    nonCount++
                }
            }
        }
        return [diffCount, nonCount]
    }

    /**
     * Prints the number of differences, the number of regular files under
     * {@code origin}, and their ratio.
     */
    private static void printResult(File origin, int diff) {
        int total = 0
        origin.eachFileRecurse {
            if (it.isFile()) {
                total++
            }
        }
        println("Diff: ${diff}, Total: ${total}, Percentage: ${diff * 1f / total}")
    }

    /**
     * Compares the two trees with the external {@code diff -rq} command and
     * prints a summary based on the number of output lines.
     *
     * Side effect: sub-directories that exist on only one side are first
     * created (empty) on the other side via {@link #createDir}.
     */
    private static void compareByShell(String path1, String path2) {
        def f1 = new File(path1)
        def f2 = new File(path2)
        createDir(f1, f2)
        createDir(f2, f1)
        // Pass arguments as a list so paths containing whitespace reach diff
        // as single arguments instead of being split.
        String result = ["diff", "-rq", path1, path2].execute().text.trim()
        printResult(f1, getLineNumberByIo(result))
    }

    /**
     * Counts the lines in {@code target}. The text is expected to have no
     * trailing line terminator (the caller trims it).
     *
     * @return 0 for {@code null} or empty text, otherwise the number of line
     *         terminators plus one.
     */
    private static int getLineNumberByIo(String target) {
        if (target == null || target.isEmpty()) {
            // No output means no differences; the "+ 1" below would otherwise
            // report one.
            return 0
        }
        LineNumberReader lnr = new LineNumberReader(new CharArrayReader(target.toCharArray()))
        try {
            lnr.skip(Long.MAX_VALUE)
        } finally {
            lnr.close()
        }
        return lnr.getLineNumber() + 1
    }

    /**
     * Creates under {@code des} every sub-directory of {@code src} that does
     * not exist there yet. Does nothing unless both arguments are directories.
     */
    private static void createDir(File src, File des) {
        if (!src.isDirectory() || !des.isDirectory()) {
            return
        }
        src.eachFileRecurse {
            if (it.isDirectory()) {
                String subPath = src.relativePath(it)
                File newFile = new File(des, subPath)
                if (!newFile.exists()) {
                    newFile.mkdirs()
                }
            }
        }
    }

    // --------------------------------------------------------------------------------------------------------- //
    /**
     * Tells whether two files have the same content, judged by length and MD5.
     * A file whose hash could not be computed is never reported as equal.
     */
    private static boolean fileSame(File f1, File f2) {
        // Different lengths cannot be the same content; skip hashing.
        if (f1.length() != f2.length()) {
            return false
        }
        String first = getMD5(f1)
        // getMD5 returns null on failure; null == null must not count as equal.
        return first != null && first == getMD5(f2)
    }

    /**
     * Reads the whole stream as text and joins its lines with {@code "\n"}.
     * Not called by the rest of this class; the stream is left for the caller
     * to close.
     */
    private static String is2String(InputStream is) {
        return new BufferedReader(new InputStreamReader(is)).lines().parallel().collect(Collectors.joining("\n"))
    }

    /**
     * Computes the MD5 digest of a file.
     *
     * @return the digest as a lowercase hex string, or {@code null} when the
     *         file cannot be read (the stack trace is printed).
     */
    private static String getMD5(File file) {
        try {
            MessageDigest digest = MessageDigest.getInstance("MD5")
            // withInputStream closes the stream even when reading fails.
            file.withInputStream { InputStream input ->
                byte[] buffer = new byte[8192]
                int length
                while ((length = input.read(buffer)) != -1) {
                    digest.update(buffer, 0, length)
                }
            }
            return new String(encodeHex(digest.digest()))
        } catch (Exception e) {
            e.printStackTrace()
            return null
        }
    }

    /**
     * Used to build output as Hex
     */
    private static final char[] DIGITS_LOWER =
            ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f']

    /**
     * Used to build output as Hex
     */
    private static final char[] DIGITS_UPPER =
            ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']

    /**
     * Encodes bytes as lowercase hex characters, two per byte.
     */
    public static char[] encodeHex(final byte[] data) {
        return encodeHex(data, true)
    }

    /**
     * Encodes bytes as hex characters, two per byte.
     *
     * @param toLowerCase {@code true} for lowercase digits, {@code false} for uppercase.
     */
    public static char[] encodeHex(final byte[] data, final boolean toLowerCase) {
        return encodeHex(data, toLowerCase ? DIGITS_LOWER : DIGITS_UPPER)
    }

    /**
     * Encodes bytes as hex characters using the given 16-entry digit table.
     */
    protected static char[] encodeHex(final byte[] data, final char[] toDigits) {
        final int l = data.length
        final char[] out = new char[l << 1]
        // two characters form the hex value.
        // The output index is declared outside the loop header because older
        // Groovy parsers reject multiple variables in a for-initializer.
        int j = 0
        for (int i = 0; i < l; i++) {
            out[j++] = toDigits[(0xF0 & data[i]) >>> 4]
            out[j++] = toDigits[0x0F & data[i]]
        }
        return out
    }
}