It may be a stupid question because I just learned Golang. I hope you understand.
I am writing a program that extracts data from a web page using the goquery
package:
package main
import (
"fmt"
"log"
"net/http"
"github.com/PuerkitoBio/goquery"
)
// url is the search-results page that getPages downloads and parses.
var url = "https://www.jobkorea.co.kr/Search/?stext=golang&tabType=recruit&Page_No=3"
// main runs the scraper; the page count returned by getPages is not used yet.
func main() {
	_ = getPages()
}
// getPages downloads the page at url, parses it with goquery and prints the
// selection of <a> elements found inside every ".tplPagination" element.
// It currently always returns 0.
func getPages() int {
	resp, err := http.Get(url)
	checkErr(err)
	// checkCode exits the process on failure, so registering the Close first
	// does not change what happens on a bad status.
	defer resp.Body.Close()
	checkCode(resp)

	document, err := goquery.NewDocumentFromReader(resp.Body)
	checkErr(err)

	paginations := document.Find(".tplPagination")
	paginations.Each(func(_ int, pagination *goquery.Selection) {
		links := pagination.Find("a")
		fmt.Println(links)
	})

	return 0
}
// checkErr aborts the program when err is non-nil.
//
// log.Fatalln writes the error to the standard logger and then calls
// os.Exit(1), so nothing placed after it would ever run.
func checkErr(err error) {
	if err != nil {
		log.Fatalln(err)
	}
}
// checkCode aborts the program unless the response status is 200 OK.
func checkCode(res *http.Response) {
	if res.StatusCode == http.StatusOK {
		return
	}
	log.Fatalln("Request failed with statusCode:", res.StatusCode)
}
It prints below:
&{[0x140002db0a0 0x140002db570 0x140002db810 0x140002dbd50 0x140002dc000 0x140002dc2a0 0x140002dc540 0x140002dc850] 0x140000b2438 0x14000305680}
&{[0x140002dcd90 0x140002dd810] 0x140000b2438 0x14000305710}
But I want to print only the first field (the slice of nodes), like this:
[0x140002dcd90 0x140002dd810]
How can I destructure them?
CodePudding user response:
The problem is that you are printing each result as it is matched.
You can save each *goquery.Selection
in a new slice and print only the last element. This example works because you want the last occurrence, but in real code you should query for something specific so that you do not depend on the order of the results.
// type Selection struct {
// Nodes []*html.Node
// document *Document
// prevSel *Selection
// }
var temp []*goquery.Selection
temp = append(temp, doc.Find(".tplPagination").Each(func(i int, s *goquery.Selection) {
s.Find("a")
}))
fmt.Printf("last: %v\n", temp[len(temp)-1])
temp[len(temp)-1]: &{[0xc0002dcd90 0xc0002e0a80] 0xc00000e3f0 0xc000309770}
The Nodes field ([]*html.Node)
can be accessed in the same way:
// Nodes is the only exported field of Selection; document and prevSel are
// unexported and cannot be read from outside the goquery package.
fmt.Printf("last: %v\n", temp[len(temp)-1].Nodes)
CodePudding user response:
As per your comment you were looking to parse the page and get the number of pages and number of posts. Here is my attempt:
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"log"
"math"
"net/http"
"strconv"
"strings"
)
// errCheck logs err and terminates the program when err is non-nil;
// a nil error is ignored.
func errCheck(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
// ExampleScrape downloads page 3 of the jobkorea search results for "golang"
// and prints the current page number, the total number of posts, the number
// of posts on this page, an estimated page count and the title of every post
// on the page.
//
// Any network, HTTP-status or parse failure terminates the program.
func ExampleScrape() {
	// %d, not %s: page is an int, and %s would render it as "%!s(int=3)".
	const searchURL = "https://www.jobkorea.co.kr/Search/?stext=golang&tabType=recruit&Page_No=%d"
	page := 3
	fmt.Println("Current page:", page)

	res, err := http.Get(fmt.Sprintf(searchURL, page))
	errCheck(err)
	defer res.Body.Close()
	if res.StatusCode != 200 {
		log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	errCheck(err)

	posts := doc.Find(".recruit-info div.dev_list.lists-cnt")
	if len(posts.Nodes) == 0 {
		// Fail with a clear message instead of panicking on an empty selection.
		log.Fatal("job list container not found in page")
	}

	// Attr reads the attribute from the first matched element; the count
	// stays 0 when the attribute is absent.
	var totalCount int
	if raw, ok := posts.Attr("total-count"); ok {
		totalCount, err = strconv.Atoi(raw)
		errCheck(err)
	}
	fmt.Println("Total count:", totalCount)

	titles := posts.Find(".list-post .title")
	perPage := len(titles.Nodes)
	fmt.Println("On this page:", perPage)

	// NOTE(review): the estimate divides by the number of posts on *this*
	// page, so it will be too high if the current page is a short last page.
	// Skip the division when there are no posts to avoid printing NaN/+Inf.
	var pages float64
	if perPage > 0 {
		pages = math.Ceil(float64(totalCount) / float64(perPage))
	}
	fmt.Println("Pages:", pages)

	fmt.Println("\nTitles on this page:")
	titles.Each(func(_ int, s *goquery.Selection) {
		fmt.Println("\t-", strings.TrimSpace(s.Text()))
	})
}
// main runs the scraping example once.
func main() { ExampleScrape() }