Skip to content

Commit

Permalink
[fix] update instagram scraper according to site changes
Browse files Browse the repository at this point in the history
closes #126
  • Loading branch information
asciimoo committed May 12, 2018
1 parent 640b5f3 commit d2a2da4
Showing 1 changed file with 114 additions and 59 deletions.
173 changes: 114 additions & 59 deletions _examples/instagram/instagram.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
package main

import (
"bytes"
"crypto/md5"
"encoding/json"
"fmt"
"log"
"net/url"
"os"
"strings"

Expand All @@ -12,16 +15,69 @@ import (

// found in https://www.instagram.com/static/bundles/en_US_Commons.js/68e7390c5938.js
// included from profile page
const instagramQueryId = "17888483320059182"
const instagramQueryId = "42323d64886122307be10013ad2dcc45"

// "id": user id, "after": end cursor
const nextPageURLTemplate string = `https://www.instagram.com/graphql/query/?query_id=17888483320059182&variables={"id":"%s","first":12,"after":"%s"}`
const nextPageURL string = `https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s`
const nextPagePayload string = `{"id":"%s","first":12,"after":"%s"}`

var requestID string

// pageInfo holds the pagination state decoded from a "page_info" JSON
// object. Both mainPageData and nextPageData embed it.
type pageInfo struct {
	// EndCursor is read from "end_cursor"; main passes it as the "after"
	// value when formatting the next page payload.
	EndCursor string `json:"end_cursor"`
	// NextPage is read from "has_next_page"; main only schedules another
	// page request when it is true.
	NextPage bool `json:"has_next_page"`
}

// mainPageData describes the JSON object that main extracts from the first
// <script> element of the profile page body and decodes with json.Unmarshal.
//
// Only the fields main reads are declared: the "rhx_gis" token, the user id,
// the first batch of media edges and the pagination info for that batch.
type mainPageData struct {
	// Rhxgis is stored in the request context under "gis"; the OnRequest
	// callback hashes it together with the query variables to build the
	// X-Instagram-GIS header.
	Rhxgis    string `json:"rhx_gis"`
	EntryData struct {
		// main only looks at the first element of ProfilePage.
		ProfilePage []struct {
			Graphql struct {
				User struct {
					// Id keeps its original spelling because main reads
					// page.Graphql.User.Id.
					Id    string `json:"id"`
					Media struct {
						Edges []struct {
							// The tag on Node used to be the malformed
							// `json::node"`, which reflect.StructTag cannot
							// parse; decoding then only worked through
							// encoding/json's case-insensitive field-name
							// fallback.
							Node struct {
								ImageURL     string `json:"display_url"`
								ThumbnailURL string `json:"thumbnail_src"`
								IsVideo      bool   `json:"is_video"`
								// NOTE(review): nextPageData reads the
								// timestamp from "taken_at_timestamp";
								// confirm "date" is still the right key here.
								Date       int `json:"date"`
								Dimensions struct {
									Width  int `json:"width"`
									Height int `json:"height"`
								} `json:"dimensions"`
							} `json:"node"`
						} `json:"edges"`
						PageInfo pageInfo `json:"page_info"`
					} `json:"edge_owner_to_timeline_media"`
				} `json:"user"`
			} `json:"graphql"`
		} `json:"ProfilePage"`
	} `json:"entry_data"`
}

// nextPageData describes the JSON body that the OnResponse callback in main
// decodes for responses that are not images: one further batch of media
// edges plus the pagination info used to request the batch after it.
//
// Every nested object carries an explicit json tag, matching mainPageData,
// instead of relying on encoding/json's case-insensitive field-name fallback
// for "user", "node" and "dimensions".
type nextPageData struct {
	Data struct {
		User struct {
			// Container maps the "edge_owner_to_timeline_media" object.
			Container struct {
				PageInfo pageInfo `json:"page_info"`
				Edges    []struct {
					Node struct {
						ImageURL     string `json:"display_url"`
						ThumbnailURL string `json:"thumbnail_src"`
						IsVideo      bool   `json:"is_video"`
						Date         int    `json:"taken_at_timestamp"`
						Dimensions   struct {
							Width  int `json:"width"`
							Height int `json:"height"`
						} `json:"dimensions"`
					} `json:"node"`
				} `json:"edges"`
			} `json:"edge_owner_to_timeline_media"`
		} `json:"user"`
	} `json:"data"`
}

func main() {
if len(os.Args) != 2 {
log.Println("Missing account name argument")
Expand All @@ -33,56 +89,68 @@ func main() {
outputDir := fmt.Sprintf("./instagram_%s/", instagramAccount)

c := colly.NewCollector(
colly.CacheDir("./_instagram_cache/"),
//colly.CacheDir("./_instagram_cache/"),
colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
)

c.OnHTML("body > script:first-of-type", func(e *colly.HTMLElement) {
jsonData := e.Text[strings.Index(e.Text, "{") : len(e.Text)-1]
data := struct {
EntryData struct {
ProfilePage []struct {
User struct {
Id string `json:"id"`
Media struct {
Nodes []struct {
ImageURL string `json:"display_src"`
ThumbnailURL string `json:"thumbnail_src"`
IsVideo bool `json:"is_video"`
Date int `json:"date"`
Dimensions struct {
Width int `json:"width"`
Height int `json:"height"`
}
}
PageInfo pageInfo `json:"page_info"`
} `json:"media"`
} `json:"user"`
} `json:"ProfilePage"`
} `json:"entry_data"`
}{}
err := json.Unmarshal([]byte(jsonData), &data)
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("X-Requested-With", "XMLHttpRequest")
r.Headers.Set("Referrer", "https://www.instagram.com/"+instagramAccount)
if r.Ctx.Get("gis") != "" {
gis := fmt.Sprintf("%s:%s", r.Ctx.Get("gis"), r.Ctx.Get("variables"))
h := md5.New()
h.Write([]byte(gis))
gisHash := fmt.Sprintf("%x", h.Sum(nil))
r.Headers.Set("X-Instagram-GIS", gisHash)
}
})

c.OnHTML("html", func(e *colly.HTMLElement) {
d := c.Clone()
d.OnResponse(func(r *colly.Response) {
idStart := bytes.Index(r.Body, []byte(`:n},queryId:"`))
requestID = string(r.Body[idStart+13 : idStart+45])
})
requestIDURL := e.Request.AbsoluteURL(e.ChildAttr(`link[as="script"]`, "href"))
d.Visit(requestIDURL)

dat := e.ChildText("body > script:first-of-type")
jsonData := dat[strings.Index(dat, "{") : len(dat)-1]
data := &mainPageData{}
err := json.Unmarshal([]byte(jsonData), data)
if err != nil {
log.Fatal(err)
}

log.Println("saving output to ", outputDir)
os.MkdirAll(outputDir, os.ModePerm)
page := data.EntryData.ProfilePage[0]
actualUserId = page.User.Id
for _, obj := range page.User.Media.Nodes {
actualUserId = page.Graphql.User.Id
for _, obj := range page.Graphql.User.Media.Edges {
// skip videos
if obj.IsVideo {
if obj.Node.IsVideo {
continue
}
c.Visit(obj.ImageURL)
c.Visit(obj.Node.ImageURL)
}
if page.User.Media.PageInfo.NextPage {
log.Println("Next page found")
c.Visit(fmt.Sprintf(nextPageURLTemplate, actualUserId, page.User.Media.PageInfo.EndCursor))
nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, page.Graphql.User.Media.PageInfo.EndCursor)
e.Request.Ctx.Put("variables", nextPageVars)
if page.Graphql.User.Media.PageInfo.NextPage {
u := fmt.Sprintf(
nextPageURL,
requestID,
url.QueryEscape(nextPageVars),
)
log.Println("Next page found", u)
e.Request.Ctx.Put("gis", data.Rhxgis)
e.Request.Visit(u)
}
})

c.OnError(func(r *colly.Response, e error) {
log.Println("error:", e, r.Request.URL, string(r.Body))
})

c.OnResponse(func(r *colly.Response) {
if strings.Index(r.Headers.Get("Content-Type"), "image") > -1 {
r.Save(outputDir + r.FileName())
Expand All @@ -93,28 +161,8 @@ func main() {
return
}

data := struct {
Data struct {
User struct {
Container struct {
PageInfo pageInfo `json:"page_info"`
Edges []struct {
Node struct {
ImageURL string `json:"display_url"`
ThumbnailURL string `json:"thumbnail_src"`
IsVideo bool `json:"is_video"`
Date int `json:"taken_at_timestamp"`
Dimensions struct {
Width int `json:"width"`
Height int `json:"height"`
}
}
} `json:"edges"`
} `json:"edge_owner_to_timeline_media"`
}
} `json:"data"`
}{}
err := json.Unmarshal(r.Body, &data)
data := &nextPageData{}
err := json.Unmarshal(r.Body, data)
if err != nil {
log.Fatal(err)
}
Expand All @@ -127,8 +175,15 @@ func main() {
c.Visit(obj.Node.ImageURL)
}
if data.Data.User.Container.PageInfo.NextPage {
log.Println("Next page found")
c.Visit(fmt.Sprintf(nextPageURLTemplate, actualUserId, data.Data.User.Container.PageInfo.EndCursor))
nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, data.Data.User.Container.PageInfo.EndCursor)
r.Request.Ctx.Put("variables", nextPageVars)
u := fmt.Sprintf(
nextPageURL,
requestID,
url.QueryEscape(nextPageVars),
)
log.Println("Next page found", u)
r.Request.Visit(u)
}
})

Expand Down

0 comments on commit d2a2da4

Please sign in to comment.