学习文章

https://blog.csdn.net/weixin_47533648/article/details/131454751
https://zhuanlan.zhihu.com/p/669412149
https://www.jb51.net/jiaoben/293230ysr.htm#_lab2_1_5
https://blog.csdn.net/weixin_43955067/article/details/130752722

1.Get请求

1.基础Get请求

package main

import (
	"fmt"
	"io"
	"net/http"
)

// main issues a plain GET request and prints the response body to stdout.
func main() {
	resp, err := http.Get("http://www.baidu.com")
	if err != nil {
		fmt.Printf("get failed, err:%v\n", err)
		return
	}
	// The response body must be closed once the caller is done with it.
	defer resp.Body.Close()

	// io.ReadAll replaces the deprecated ioutil.ReadAll; the import block
	// above now matches the package this call actually uses.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("read from resp.Body failed, err:%v\n", err)
		return
	}
	fmt.Print(string(body))
}

2.带Get型参数

package main  

import (
"fmt"
"io"
"net/http"
"net/url"
)

// main builds a GET URL carrying query parameters, sends the request and
// prints the response body.
func main() {
	apiURL := "http://127.0.0.1:9090/get"

	// Query parameters; Encode emits them sorted by key and percent-escaped.
	params := url.Values{}
	params.Set("name", "mi")
	params.Set("age", "18")

	u, err := url.ParseRequestURI(apiURL)
	if err != nil {
		fmt.Printf("parse url requestUrl failed, err:%v\n", err)
		// u is nil on failure; continuing would dereference it below.
		return
	}
	u.RawQuery = params.Encode()
	// Prints http://127.0.0.1:9090/get?age=18&name=mi
	fmt.Println(u.String())

	resp, err := http.Get(u.String())
	if err != nil {
		fmt.Printf("get failed, err:%v\n", err)
		return
	}
	defer resp.Body.Close()

	b, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("get resp failed, err:%v\n", err)
		return
	}
	// The output depends on the server, e.g. {"status": "ok"}.
	fmt.Println(string(b))
}

3.请求头修改

package main

import (
	"fmt"
	"net/http"
	"net/http/httputil"
)

// main sends a GET request with custom Authorization and User-Agent headers
// and prints the raw response (status line, headers and body).
func main() {
	target := "http://127.0.0.1:9090/get"
	request, err := http.NewRequest(http.MethodGet, target, nil)
	if err != nil {
		panic(err)
	}
	request.Header.Set("Authorization", "jhs8723sd2dshd2")
	// The value must be the agent string only; repeating the "User-Agent: "
	// header name inside the value would send a malformed agent string.
	request.Header.Set("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")

	resp, err := http.DefaultClient.Do(request)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Dump the whole response; passing true includes the body.
	dump, err := httputil.DumpResponse(resp, true)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", dump)
}

2.Post请求

1.基础Post请求

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

// main posts a JSON payload with http.Post and prints the response body.
func main() {
	endpoint := "http://127.0.0.1:9090/post"

	// To send form data instead, use:
	//   contentType := "application/x-www-form-urlencoded"
	//   data := "name=mi&age=18"
	// JSON payload:
	contentType := "application/json"
	data := `{"name":"mi","age":18}`

	resp, err := http.Post(endpoint, contentType, strings.NewReader(data))
	if err != nil {
		fmt.Printf("post failed, err:%v\n", err)
		return
	}
	defer resp.Body.Close()

	b, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("get resp failed, err:%v\n", err)
		return
	}
	// The output depends on the server, e.g. {"status": "ok"}.
	fmt.Println(string(b))
}

2.Postform

package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

// main submits a URL-encoded form with http.PostForm and prints the
// response body.
func main() {
	form := url.Values{"key": {"Value"}, "id": {"123"}}
	resp, err := http.PostForm("http://127.0.0.1:9090/post", form)
	if err != nil {
		fmt.Println(err)
		// resp is nil on failure, so the deferred Close below must not run.
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(body))
}

传json

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

// main marshals a map to JSON, posts it with an application/json content
// type and prints the response body.
func main() {
	payload := map[string]interface{}{
		"site": "www.baidu.com",
		"name": "tom",
	}
	encoded, err := json.Marshal(payload)
	if err != nil {
		fmt.Printf("marshal failed, err:%v\n", err)
		return
	}

	resp, err := http.Post("http://httpbin.org/post", "application/json", bytes.NewReader(encoded))
	if err != nil {
		// resp is nil on failure; it must not be touched.
		fmt.Printf("post failed, err:%v\n", err)
		return
	}
	// The body has to be closed so the underlying connection is released.
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("get resp failed, err:%v\n", err)
		return
	}
	fmt.Println(string(body))
}

3.请求头修改

package main  

import (
"fmt"
"io"
"net/http"
"strings"
)

// main sends a form-encoded POST with custom Content-Type and Cookie
// headers and prints the response body.
func main() {
	client := &http.Client{}
	req, err := http.NewRequest(http.MethodPost, "http://www.01happy.com/demo/accept.php", strings.NewReader("name=aa"))
	if err != nil {
		fmt.Println(err)
		// req is nil on failure; setting headers on it would panic.
		return
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	req.Header.Set("Cookie", "name=anny")

	resp, err := client.Do(req)
	if err != nil {
		fmt.Println(err)
		// resp is nil on failure, so the deferred Close below must not run.
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(body))
}

3.Client

1.client

client可根据需要自定义行为,比如设置超时时间、代理、transport等

package main  

import (
"fmt"
"net/http"
"time"
)

// main shows how to issue a request through an http.Client configured with
// a request timeout.
func main() {
	// Only Timeout (10 seconds) is set here; fields such as Transport or
	// CheckRedirect can be customised in the same literal.
	httpClient := http.Client{Timeout: 10 * time.Second}

	// Send the request through the customised client.
	resp, err := httpClient.Get("http://example.com")
	if err == nil {
		defer resp.Body.Close()
		// Response handling would go here.
		return
	}
	fmt.Println("Error:", err)
}

CheckRedirect

package main

import (
"fmt"
"net/http"
)

// main issues a GET through a client whose CheckRedirect hook is
// redirectPolicyFunc.
func main() {
	// Start from the zero-value client and install the redirect hook.
	var hooked http.Client
	hooked.CheckRedirect = redirectPolicyFunc

	// Send the request through the customised client.
	resp, err := hooked.Get("http://example.com")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	defer resp.Body.Close()
}

// redirectPolicyFunc is a custom redirect policy for http.Client.CheckRedirect.
// It ignores both arguments and always returns http.ErrUseLastResponse so
// that the redirect is not followed.
func redirectPolicyFunc(_ *http.Request, _ []*http.Request) error {
	return http.ErrUseLastResponse
}

2.transport

可以设置代理、TLS、连接池等

package main  

import (
"crypto/tls"
"fmt"
"io"
"net"
"net/http"
"time"
)

// main issues a GET through a client backed by a custom http.Transport
// (proxy, TLS, idle-connection limits and dial timeouts) and prints the
// response body.
func main() {
	// Custom transport.
	tr := &http.Transport{
		// Take the proxy settings from the environment.
		Proxy: http.ProxyFromEnvironment,
		// TLS client configuration.
		// WARNING: InsecureSkipVerify disables certificate verification and
		// exposes the connection to man-in-the-middle attacks; keep it for
		// local experiments only.
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		// Maximum number of idle connections overall.
		MaxIdleConns: 10,
		// Maximum number of idle connections per host.
		MaxIdleConnsPerHost: 10,
		// Dial (connect) timeout and TCP keep-alive period.
		DialContext: (&net.Dialer{
			Timeout:   30 * time.Second,
			KeepAlive: 30 * time.Second,
		}).DialContext,
	}

	// Client that sends its requests through the custom transport.
	client := &http.Client{
		Transport: tr,
	}

	resp, err := client.Get("http://xrect1fy.github.io")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	fmt.Println(string(body))
}

代理

package main  

import (
"fmt"
"io"
"net/http"
"net/url"
)

// main issues a GET through a client whose transport routes every request
// via the HTTP proxy at 127.0.0.1:8080, then prints the response body.
func main() {
	// Parse the proxy address once up front rather than on every request.
	proxyURL, err := url.Parse("http://127.0.0.1:8080")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Custom transport; http.ProxyURL returns a Proxy function that always
	// yields the given URL.
	tr := &http.Transport{
		Proxy: http.ProxyURL(proxyURL),
	}

	// Client that sends its requests through the custom transport.
	client := &http.Client{
		Transport: tr,
	}

	resp, err := client.Get("http://xrect1fy.github.io")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	fmt.Println(string(body))
}

4. 内容匹配

1.regexp正则匹配
https://blog.csdn.net/weixin_47533648/article/details/131454751

2.golang.org/x/net/html 匹配节点
https://blog.csdn.net/weixin_43955067/article/details/130752722

5.goroutine提速

https://zhuanlan.zhihu.com/p/669412149

package main  

import (
"fmt"
"golang.org/x/net/html"
"io"
"net/http"
"strings"
"sync"
)

// main fetches every configured page in its own goroutine, prints the links
// found on each page, and waits for all goroutines before exiting.
func main() {
	// Pages to crawl.
	targets := []string{"http://xrect1fy.github.io", "http://baidu.com", "http://bilibili.com"}

	var pending sync.WaitGroup
	pending.Add(len(targets))
	for i := range targets {
		// The address is passed as an argument so every goroutine works on
		// its own copy.
		go func(page string) {
			defer pending.Done()

			body, err := fetchPage(page)
			if err != nil {
				fmt.Println("Error fetching page:", err)
				return
			}
			// Extract and report the links of this page.
			fmt.Println("Links on", page, ":", parseLinks(body))
		}(targets[i])
	}
	pending.Wait()
}

// fetchPage downloads url with the default HTTP client and returns the
// response body as a string.
//
// It returns an error when the request fails, when the server answers with
// a non-2xx status, or when the body cannot be read in full. The returned
// string is empty in every error case.
func fetchPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// An error page is not valid content; report it instead of handing its
	// HTML to the caller as if the fetch had succeeded.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return "", fmt.Errorf("fetching %s: unexpected status %s", url, resp.Status)
	}

	// Read the whole page into memory.
	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(data), nil
}

// parseLinks tokenizes data as HTML with golang.org/x/net/html and returns
// the value of every href attribute found on an <a> start tag, in document
// order. The result is an empty, non-nil slice when nothing matches.
func parseLinks(data string) []string {
	hrefs := []string{}
	z := html.NewTokenizer(strings.NewReader(data))
	// ErrorToken ends the scan, so the loop stops there.
	for kind := z.Next(); kind != html.ErrorToken; kind = z.Next() {
		tok := z.Token()
		if kind != html.StartTagToken || tok.Data != "a" {
			continue
		}
		for i := range tok.Attr {
			if attr := tok.Attr[i]; attr.Key == "href" {
				hrefs = append(hrefs, attr.Val)
			}
		}
	}
	return hrefs
}

参考文章

https://blog.csdn.net/weixin_47533648/article/details/131454751
https://zhuanlan.zhihu.com/p/669412149
https://www.jb51.net/jiaoben/293230ysr.htm#_lab2_1_5
https://blog.csdn.net/weixin_43955067/article/details/130752722