最近在整理手头的书籍,想完善书籍的信息,又不想手动整理。

下面是抓取豆瓣书籍详情的一些步骤

  1. 通过 fetch(`https://book.douban.com/isbn/${isbn}`) 获取详情页的 HTML,
    其中几个 meta 标签包含书籍的相关信息。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17

    <meta property="og:title" content="同意" />
    <meta property="og:description" content="是的,我同意了,那年我十四岁。
    他们说我不是受害者,而是同谋。

    推动法国性同意立法的现象级图书
    授权29种语言 法语版销量超30万
    让-雅克·卢梭自传奖| ELLE杂志读者大奖非虚构奖


    这是..." />
    <meta property="og:site_name" content="豆瓣" />
    <meta property="og:url" content="https://book.douban.com/subject/35947066/" />
    <meta property="og:image" content="https://img9.doubanio.com/view/subject/l/public/s34390216.jpg" />
    <meta property="og:type" content="book" />
    <meta property="book:author" content="[法] 瓦内莎·斯普林格拉" />
    <meta property="book:isbn" content="9787549637300" />
  2. 通过正则处理并提取。

正则如下

1
// Group 1 is the meta property name, group 2 is the content value.
// `[^"]*` captures everything up to the closing quote, so values that contain
// spaces, newlines or CJK characters are matched as well.
/<meta\s+property="([\w:]+)"\s+content="([^"]*)"\s*\/?>/g
  3. 创建一个 class 来专门处理解析逻辑。
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20

    /**
     * Extracts `<meta property="..." content="..." />` tags from an HTML string.
     */
    class MetadataParser {
      /**
       * @param {string} htmlCode - Raw HTML of the page to scan.
       */
      constructor(htmlCode) {
        this.htmlCode = htmlCode;
      }

      /**
       * Collects every matching meta tag in document order.
       *
       * @returns {Array<{property: string, content: string}>} One entry per tag.
       */
      parse() {
        // `[^"]*` accepts everything up to the closing quote. The previous
        // character class excluded spaces, so values such as an author name
        // with a space in it were silently dropped.
        const metaTagPattern = /<meta\s+property="([\w:]+)"\s+content="([^"]*)"\s*\/?>/g;
        const html = String(this.htmlCode ?? "");

        return Array.from(html.matchAll(metaTagPattern), ([, property, content]) => ({
          property,
          content,
        }));
      }

      /**
       * @returns {string} The result of `parse()` serialized as JSON.
       */
      jsonStr() {
        return JSON.stringify(this.parse());
      }
    }

4. 最终代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

/**
 * Extracts `<meta property="..." content="..." />` tags from an HTML string.
 */
class MetadataParser {
  /**
   * @param {string} htmlCode - Raw HTML of the page to scan.
   */
  constructor(htmlCode) {
    this.htmlCode = htmlCode;
  }

  /**
   * Collects every matching meta tag in document order.
   *
   * @returns {Array<{property: string, content: string}>} One entry per tag.
   */
  parse() {
    // `[^"]*` accepts everything up to the closing quote. The previous
    // character class excluded spaces, so values such as an author name
    // with a space in it were silently dropped.
    const metaTagPattern = /<meta\s+property="([\w:]+)"\s+content="([^"]*)"\s*\/?>/g;
    const html = String(this.htmlCode ?? "");

    return Array.from(html.matchAll(metaTagPattern), ([, property, content]) => ({
      property,
      content,
    }));
  }

  /**
   * @returns {string} The result of `parse()` serialized as JSON.
   */
  jsonStr() {
    return JSON.stringify(this.parse());
  }
}

/**
 * HTTP handler: reads `{ "isbn": "..." }` from the JSON request body, fetches
 * the matching Douban book page and responds with the page's meta tags as JSON.
 *
 * @param {Request} _req - Incoming request whose body is JSON with an `isbn` field.
 * @returns {Promise<Response>} 200 with the parsed meta tags; 400 for a bad
 *   request body or ISBN; 404 when Douban has no such book; 502 when the
 *   upstream request fails.
 */
async function handler(_req) {
  const jsonResponse = (payload, status) =>
    new Response(JSON.stringify(payload), {
      status,
      headers: { "content-type": "application/json; charset=utf-8" },
    });

  let isbn;
  try {
    ({ isbn } = await _req.json());
  } catch {
    return jsonResponse({ error: "request body must be JSON with an \"isbn\" field" }, 400);
  }

  // The ISBN is interpolated into the upstream URL, so only accept a plain
  // ISBN-10 / ISBN-13 (hyphens allowed) and reject anything else.
  const normalizedIsbn = String(isbn ?? "").trim().replaceAll("-", "");
  if (!/^(?:\d{9}[\dXx]|\d{13})$/.test(normalizedIsbn)) {
    return jsonResponse({ error: "isbn must be a valid ISBN-10 or ISBN-13" }, 400);
  }

  // Browser-like headers copied from a real Chrome navigation request.
  const request = new Request(`https://book.douban.com/isbn/${normalizedIsbn}`, {
    headers: {
      "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
      "accept-language": "zh-CN,zh;q=0.9",
      "cache-control": "no-cache",
      "pragma": "no-cache",
      "priority": "u=0, i",
      "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": "\"macOS\"",
      "sec-fetch-dest": "document",
      "sec-fetch-mode": "navigate",
      "sec-fetch-site": "none",
      "sec-fetch-user": "?1",
      "upgrade-insecure-requests": "1",
    },
    method: "GET",
  });

  let upstream;
  try {
    upstream = await fetch(request);
  } catch (err) {
    console.error("douban request failed", err);
    return jsonResponse({ error: "failed to reach douban" }, 502);
  }

  if (!upstream.ok) {
    // Do not parse an error page as if it were a book page.
    const status = upstream.status === 404 ? 404 : 502;
    return jsonResponse({ error: `douban responded with status ${upstream.status}` }, status);
  }

  const parser = new MetadataParser(await upstream.text());

  // The body is JSON, so advertise it as JSON rather than text/html.
  return new Response(parser.jsonStr(), {
    status: 200,
    headers: {
      "content-type": "application/json; charset=utf-8",
    },
  });
}

// Entry point: hand `handler` to Deno's built-in HTTP server.
Deno.serve(handler);