Skip to content

Commit fe22aa4

Browse files
committed
add chromedp proxy and update spider/core.go
1 parent 2d937e4 commit fe22aa4

File tree

7 files changed

+78
-65
lines changed

7 files changed

+78
-65
lines changed

chrome/doi.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,20 @@ import (
99
"github.com/openbiox/butils/log"
1010
stringo "github.com/openbiox/butils/stringo"
1111

12+
"github.com/chromedp/chromedp"
1213
cdp "github.com/chromedp/chromedp"
1314
)
1415

15-
func DoiSupplURLs(url string, timeout time.Duration) []string {
16+
func DoiSupplURLs(url string, timeout time.Duration, proxy string) []string {
1617
// create context
17-
ctx, _ := cdp.NewContext(context.Background())
18-
ctx, _ = context.WithTimeout(ctx, timeout)
19-
//defer cancel()
18+
o := append(cdp.DefaultExecAllocatorOptions[:],
19+
//... any options here
20+
chromedp.ProxyServer(proxy),
21+
)
22+
cx, cancel := chromedp.NewExecAllocator(context.Background(), o...)
23+
ctx, cancel := cdp.NewContext(cx)
24+
ctx, cancel = context.WithTimeout(ctx, timeout)
25+
defer cancel()
2026
var err error
2127
var attbs []map[string]string
2228
urls := []string{}
@@ -48,7 +54,7 @@ func visibleScienceDirect(host string, attbs *[]map[string]string) cdp.Tasks {
4854
cdp.Navigate(host),
4955
cdp.WaitVisible(`.show-toc-button`, cdp.ByQuery),
5056
cdp.Click(`.show-toc-button`, cdp.ByQuery),
51-
//cdp.WaitVisible(`a[href="#app2"]`, cdp.ByQuery),
57+
//cdp.WaitVisible(`a[href="#app2"]`, cdp.ByQuery),
5258
//cdp.Click(`a[href="#app2"]`, cdp.ByQuery),
5359
//cdp.WaitVisible(`#app2`, cdp.ByQuery),
5460
cdp.AttributesAll(".Appendices a.icon-link[href]", attbs, cdp.ByQueryAll),

doc/doi.list.journal.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,10 @@ Rank Journal.Name Fulltext-1pass-check Supplemental-1pass-check DOI-1pass Fullte
7878
80 ANNALS OF INTERNAL MEDICINE bug 10.7326/M19-3111
7979
81 GASTROENTEROLOGY done yes 10.1186/s12876-019-1087-9
8080
82 INTENSIVE CARE MEDICINE done 10.1007/s00134-019-05829-0
81-
83 JOURNAL OF HEPATOLOGY done no 10.1016/j.jhep.2019.10.023
81+
83 JOURNAL OF HEPATOLOGY done yes 10.1016/j.jhep.2019.10.023
8282
84 Annual Review of Plant Biology done 10.1146/annurev-arplant-050718-100016
8383
85 PHARMACOLOGICAL REVIEWS done 10.1124/jpet.119.260968
84-
86 JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY done no 10.1016/j.jacc.2019.09.001
84+
86 JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY done yes 10.1016/j.jacc.2019.09.067
8585
87 Nature Reviews Rheumatology done yes 10.1038/s41584-019-0335-2
8686
88 Lancet Psychiatry todo 10.1016/S2215-0366(19)30394-3
8787
90 Chem todo no 10.1016/j.chempr.2019.06.020
@@ -2781,4 +2781,4 @@ Rank Journal.Name Fulltext-1pass-check Supplemental-1pass-check DOI-1pass Fullte
27812781
2903 PERMAFROST AND PERIGLACIAL PROCESSES NA
27822782
2903 SURGICAL ONCOLOGY-OXFORD NA
27832783
2903 Topics in Organometallic Chemistry NA
2784-
2903 TRANSFUSION MEDICINE AND HEMOTHERAPY NA
2784+
2903 TRANSFUSION MEDICINE AND HEMOTHERAPY NA

doc/doi.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -760,7 +760,7 @@ Supported DOI by bget (Journal)
760760
<td style="text-align: right;">83</td>
761761
<td style="text-align: left;">JOURNAL OF HEPATOLOGY</td>
762762
<td style="text-align: left;">done</td>
763-
<td style="text-align: left;">no</td>
763+
<td style="text-align: left;">yes</td>
764764
<td style="text-align: left;">10.1016/j.jhep.2019.10.023</td>
765765
<td style="text-align: left;">NA</td>
766766
<td style="text-align: left;">NA</td>
@@ -787,8 +787,8 @@ Supported DOI by bget (Journal)
787787
<td style="text-align: right;">86</td>
788788
<td style="text-align: left;">JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY</td>
789789
<td style="text-align: left;">done</td>
790-
<td style="text-align: left;">no</td>
791-
<td style="text-align: left;">10.1016/j.jacc.2019.09.001</td>
790+
<td style="text-align: left;">yes</td>
791+
<td style="text-align: left;">10.1016/j.jacc.2019.09.067</td>
792792
<td style="text-align: left;">NA</td>
793793
<td style="text-align: left;">NA</td>
794794
</tr>

go.mod

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,24 @@ go 1.12
55
require (
66
github.com/PuerkitoBio/goquery v1.5.0
77
github.com/andybalholm/cascadia v1.1.0 // indirect
8-
github.com/antchfx/htmlquery v1.1.0 // indirect
9-
github.com/antchfx/xmlquery v1.1.0 // indirect
10-
github.com/antchfx/xpath v1.1.0 // indirect
11-
github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90 // indirect
12-
github.com/chromedp/chromedp v0.4.0
8+
github.com/antchfx/htmlquery v1.2.0 // indirect
9+
github.com/antchfx/xmlquery v1.2.0 // indirect
10+
github.com/antchfx/xpath v1.1.1 // indirect
11+
github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 // indirect
12+
github.com/chromedp/chromedp v0.5.1
1313
github.com/gocolly/colly v1.2.0
14+
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 // indirect
1415
github.com/google/go-github/v27 v27.0.6
1516
github.com/mattn/go-runewidth v0.0.6 // indirect
1617
github.com/olekukonko/tablewriter v0.0.2
1718
github.com/openbiox/butils v0.0.0-20191109071326-0a3e37c394b2
1819
github.com/pierrec/lz4 v2.3.0+incompatible // indirect
1920
github.com/spf13/cobra v0.0.5
2021
github.com/spf13/pflag v1.0.5 // indirect
21-
github.com/vbauerster/mpb/v4 v4.11.0
22-
golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a // indirect
23-
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 // indirect
22+
github.com/vbauerster/mpb/v4 v4.11.1
23+
golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f // indirect
24+
golang.org/x/net v0.0.0-20191118183410-d06c31c94cae // indirect
2425
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
25-
golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd // indirect
26+
golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2 // indirect
2627
google.golang.org/appengine v1.6.5 // indirect
2728
)

go.sum

Lines changed: 28 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,24 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z
1111
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
1212
github.com/antchfx/htmlquery v1.0.0 h1:O5IXz8fZF3B3MW+B33MZWbTHBlYmcfw0BAxgErHuaMA=
1313
github.com/antchfx/htmlquery v1.0.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
14-
github.com/antchfx/htmlquery v1.1.0 h1:KMS88sLl5KP9GUVU2MQIDcQXNQ0M5MGlkC9WlYgAQqY=
15-
github.com/antchfx/htmlquery v1.1.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
14+
github.com/antchfx/htmlquery v1.2.0 h1:oKShnsGlnOHX6t4uj5OHgLKkABcJoqnXpqnscoi9Lpw=
15+
github.com/antchfx/htmlquery v1.2.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
1616
github.com/antchfx/xmlquery v1.0.0 h1:YuEPqexGG2opZKNc9JU3Zw6zFXwC47wNcy6/F8oKsrM=
1717
github.com/antchfx/xmlquery v1.0.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
18-
github.com/antchfx/xmlquery v1.1.0 h1:vj0kZ1y3Q6my4AV+a9xbWrMYzubw+84zuiKgvfV8vb8=
19-
github.com/antchfx/xmlquery v1.1.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
18+
github.com/antchfx/xmlquery v1.2.0 h1:1nrzsSN5mFrlqFWSK9byiq/qXKE7O2vivYzhv1Ksnfw=
19+
github.com/antchfx/xmlquery v1.2.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
2020
github.com/antchfx/xpath v1.0.0 h1:Q5gFgh2O40VTSwMOVbFE7nFNRBu3tS21Tn0KAWeEjtk=
2121
github.com/antchfx/xpath v1.0.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
22-
github.com/antchfx/xpath v1.1.0 h1:mJTvYpiHvxNQRD4Lbfin/FodHVCHh2a5KrOFr4ZxMOI=
23-
github.com/antchfx/xpath v1.1.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
22+
github.com/antchfx/xpath v1.1.1 h1:mqGYmd5pioPu06+REIf8j3y6O3S1UpVNVoCameZHotg=
23+
github.com/antchfx/xpath v1.1.1/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
2424
github.com/antonfisher/nested-logrus-formatter v1.0.2 h1:t65eOqj0fWbOkZR2+OgmxPa0KYIwbPhKdYmseaCMIyI=
2525
github.com/antonfisher/nested-logrus-formatter v1.0.2/go.mod h1:6WTfyWFkBc9+zyBaKIqRrg/KwMqBbodBjgbHjDz7zjA=
2626
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
27-
github.com/chromedp/cdproto v0.0.0-20190812224334-39ef923dcb8d h1:00kLGv5nKzpFchNhGDXDRbKtYx/WoT983Ka2t8/pzRE=
28-
github.com/chromedp/cdproto v0.0.0-20190812224334-39ef923dcb8d/go.mod h1:0YChpVzuLJC5CPr+x3xkHN6Z8KOSXjNbL7qV8Wc4GW0=
29-
github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90 h1:CgIuU+BmhL7FOXl4nTH3L1pwPbAz1VlzexJNEfrS7Kw=
30-
github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90/go.mod h1:0YChpVzuLJC5CPr+x3xkHN6Z8KOSXjNbL7qV8Wc4GW0=
31-
github.com/chromedp/chromedp v0.4.0 h1:0AJC5ejETuh/6n7Tcsw4u4G0eKZkI9aVRwckWaImLUE=
32-
github.com/chromedp/chromedp v0.4.0/go.mod h1:DC3QUn4mJ24dwjcaGQLoZrhm4X/uPHZ6spDbS2uFhm4=
27+
github.com/chromedp/cdproto v0.0.0-20191009033829-c22f49c9ff0a/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
28+
github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 h1:QD3KxSJ59L2lxG6MXBjNHxiQO2RmxTQ3XcK+wO44WOg=
29+
github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
30+
github.com/chromedp/chromedp v0.5.1 h1:PAqhoCWCHzRphYnmmxLSiYk7EEwDplCm4woTCCaV2cQ=
31+
github.com/chromedp/chromedp v0.5.1/go.mod h1:3NMfuKTrKNr8PWEvHzdzZ57PK4jm9zW1C5nKiaWdxcM=
3332
github.com/coreos/etcd v3.3.10+incompatible h1:jFneRYjIvLMLhDLCzuTuU4rSJUjRplcJQ7pD7MnhC04=
3433
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
3534
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
@@ -55,6 +54,8 @@ github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo=
5554
github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
5655
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
5756
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
57+
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 h1:uHTyIjqVhYRhLbJ8nIiOJHkEZZ+5YoOsAbD3sk82NiE=
58+
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
5859
github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
5960
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
6061
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
@@ -77,8 +78,8 @@ github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8Nz
7778
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
7879
github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
7980
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
80-
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls=
81-
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ=
81+
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs=
82+
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0=
8283
github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
8384
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
8485
github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
@@ -89,8 +90,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
8990
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
9091
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
9192
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
92-
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e h1:hB2xlXdHp/pmPZq0y3QnmWAArdw9PqbmotexnWx/FU8=
93-
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
93+
github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM=
94+
github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
9495
github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y=
9596
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
9697
github.com/mattn/go-runewidth v0.0.6 h1:V2iyH+aX9C5fsYCpK60U8BYIvmhqxuOL3JZcqc1NB7k=
@@ -145,8 +146,8 @@ github.com/ulikunitz/xz v0.5.6 h1:jGHAfXawEGZQ3blwU5wnWKQJvAraT7Ftq9EXjnXYgt8=
145146
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
146147
github.com/vbauerster/mpb/v4 v4.9.3 h1:fZv72LoQvz8Pz6OeqUSJr62kMCQDHyOtuY0nl93CcJM=
147148
github.com/vbauerster/mpb/v4 v4.9.3/go.mod h1:xMKSr3w3dixpCH9v7svY4wF3mmhuyWYuYtkpy8T5FOk=
148-
github.com/vbauerster/mpb/v4 v4.11.0 h1:QdSmlc4dUap9XugHWx84yi7ABstYHW1rC5slzDwfXnw=
149-
github.com/vbauerster/mpb/v4 v4.11.0/go.mod h1:2d50DYyCBW+8eE9ZgdMCDEB+7S+ELz4YenPtQ+nKOts=
149+
github.com/vbauerster/mpb/v4 v4.11.1 h1:ZOYQSVHgmeanXsbyC44aDg76tBGCS/54Rk8VkL8dJGA=
150+
github.com/vbauerster/mpb/v4 v4.11.1/go.mod h1:vMLa1J/ZKC83G2lB/52XpqT+ZZtFG4aZOdKhmpRL1uM=
150151
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo=
151152
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos=
152153
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
@@ -156,10 +157,9 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
156157
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
157158
golang.org/x/crypto v0.0.0-20190829043050-9756ffdc2472 h1:Gv7RPwsi3eZ2Fgewe3CBsuOebPwO27PoXzRpJPsvSSM=
158159
golang.org/x/crypto v0.0.0-20190829043050-9756ffdc2472/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
159-
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
160-
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
161-
golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a h1:R/qVym5WAxsZWQqZCwDY/8sdVKV1m1WgU4/S5IRQAzc=
162-
golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
160+
golang.org/x/crypto v0.0.0-20191112222119-e1110fd1c708/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
161+
golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f h1:kz4KIr+xcPUsI3VMoqWfPMvtnJ6MGfiVwsWSVzphMO4=
162+
golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
163163
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
164164
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
165165
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -171,8 +171,8 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65 h1:+rhAzEzT3f4JtomfC371qB+0O
171171
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
172172
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297 h1:k7pJ2yAPLPgbskkFdhRCsA77k2fySZ1zf2zCjvQCiIM=
173173
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
174-
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 h1:bHNaocaoJxYBo5cw41UyTMLjYlb8wPY7+WFrnklbHOM=
175-
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
174+
golang.org/x/net v0.0.0-20191118183410-d06c31c94cae h1:AzDIJnLFoW3GaQvpbMRKk+SptYRYtnhYdyuX+S/dTbc=
175+
golang.org/x/net v0.0.0-20191118183410-d06c31c94cae/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
176176
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
177177
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0=
178178
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -185,13 +185,12 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
185185
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
186186
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
187187
golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
188-
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a h1:aYOabOQFp6Vj6W1F80affTUvO9UxmJRx8K0gsfABByQ=
189-
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
190188
golang.org/x/sys v0.0.0-20190904154756-749cb33beabd h1:DBH9mDw0zluJT/R+nGuV3jWFWLFaHyYZWD4tOT+cjn0=
191189
golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
192-
golang.org/x/sys v0.0.0-20191025090151-53bf42e6b339/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
193-
golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd h1:3x5uuvBgE6oaXJjCOvpCC1IpgJogqQ+PqGGU3ZxAgII=
194-
golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
190+
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
191+
golang.org/x/sys v0.0.0-20191113165036-4c7a9d0fe056/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
192+
golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2 h1:wAW1U21MfVN0sUipAD8952TBjGXMRHFKQugDlQ9RwwE=
193+
golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
195194
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
196195
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
197196
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=

spider/core.go

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -28,26 +28,28 @@ func NatureComSpider(opt *DoiSpiderOpt) (urls []string) {
2828
if opt.FullText {
2929
c.OnHTML("a.c-pdf-download__link[href]", func(e *colly.HTMLElement) {
3030
link := e.Attr("href")
31-
urls = append(urls, "https://nature.com"+link)
31+
urls = append(urls, linkFilter(link, opt.URL))
3232
})
3333
}
3434
if opt.Supplementary {
3535
c.OnHTML("a.print-link[href]", func(e *colly.HTMLElement) {
3636
link := e.Attr("href")
37-
if !strings.HasPrefix(link, "http") {
38-
urls = append(urls, "https://nature.com"+link)
39-
} else {
40-
u, _ := url.Parse(link)
41-
linkTmp := strings.Split(u.Path, "/")
42-
if len(linkTmp) < 4 {
43-
return
37+
if !strings.Contains(link, "/figures/") {
38+
if !strings.HasPrefix(link, "http") {
39+
urls = append(urls, linkFilter(link, opt.URL))
40+
} else {
41+
u, _ := url.Parse(link)
42+
linkTmp := strings.Split(u.Path, "/")
43+
if len(linkTmp) < 4 {
44+
return
45+
}
46+
linkTmp[2] = stringo.StrReplaceAll(linkTmp[2], "art:", "art%3A")
47+
newLink := append(linkTmp[0:2], strings.Join(linkTmp[2:4], "%2F"))
48+
newLink = append(newLink, linkTmp[4:len(linkTmp)]...)
49+
link = strings.Join(newLink, "/")
50+
link = u.Scheme + "://" + u.Host + link
51+
urls = append(urls, link)
4452
}
45-
linkTmp[2] = stringo.StrReplaceAll(linkTmp[2], "art:", "art%3A")
46-
newLink := append(linkTmp[0:2], strings.Join(linkTmp[2:4], "%2F"))
47-
newLink = append(newLink, linkTmp[4:len(linkTmp)]...)
48-
link = strings.Join(newLink, "/")
49-
link = u.Scheme + "://" + u.Host + link
50-
urls = append(urls, link)
5153
}
5254
})
5355
}
@@ -108,7 +110,8 @@ func CellComSpider(opt *DoiSpiderOpt) []string {
108110
c := colly.NewCollector(
109111
colly.AllowedDomains("doi.org", "www.cell.com", "cell.com", "linkinghub.elsevier.com", "secure.jbs.elsevierhealth.com",
110112
"id.elsevier.com", "www.cancercell.org", "www.sciencedirect.com",
111-
"pdf.sciencedirectassets.com", "www.thelancet.com", "www.gastrojournal.org"),
113+
"pdf.sciencedirectassets.com", "www.thelancet.com", "www.gastrojournal.org",
114+
"www.clinicalkey.com"),
112115
colly.MaxDepth(1),
113116
)
114117
bspider.SetSpiderProxy(c, opt.Proxy, opt.Timeout)
@@ -175,7 +178,7 @@ func CellComSpider(opt *DoiSpiderOpt) []string {
175178
c.Visit(fmt.Sprintf("https://doi.org/%s", opt.Doi))
176179
if opt.Supplementary {
177180
urls = append(urls, chrome.DoiSupplURLs(fmt.Sprintf("https://doi.org/%s", opt.Doi),
178-
time.Duration(opt.Timeout)*time.Second)...)
181+
time.Duration(opt.Timeout)*time.Second, opt.Proxy)...)
179182
c.OnHTML("#appsec1 a[target=new]", func(e *colly.HTMLElement) {
180183
link := e.Attr("href")
181184
urls = append(urls, link)

spider/universal.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ func UniVersalDoiSpider(opt *DoiSpiderOpt) (urls []string) {
3535
link = stringo.StrReplaceAll(link, "pdf[?].*", "pdf")
3636
urls = append(urls, linkFilter(link, opt.URL))
3737
})
38+
c.OnHTML("a.article-pdfLink[data-article-url]", func(e *colly.HTMLElement) {
39+
link := e.Attr("data-article-url")
40+
urls = append(urls, linkFilter(link, opt.URL))
41+
})
3842
staticUrl := static2pdf(opt)
3943
if staticUrl != "" {
4044
urls = append(urls, linkFilter(staticUrl, opt.URL))

0 commit comments

Comments
 (0)