path: root/scripts/reddit-scrape
#!/bin/sh
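# reddit-scrape: download image posts from a subreddit's JSON listing.
# Usage: reddit-scrape subreddit [sort] [top_time]
#   sort      listing to walk, e.g. hot, new, top (defaults to hot)
#   top_time  time window sent as the "t" query parameter (e.g. week, all)
# Example: ./reddit-scrape pics top week
# Requires wget and jq.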

# config
useragent="Love by u/gadelat"
timeout=60

subreddit=$1
sort=${2:-hot}
top_time=${3:-}

if [ -z "$subreddit" ]; then
    echo "usage: $0 subreddit [sort] [top_time]" >&2
    exit 1
fi

url="https://www.reddit.com/r/$subreddit/$sort/.json?raw_json=1&t=$top_time"
content=`wget -T $timeout -U "$useragent" -q -O - $url`
mkdir -p $subreddit
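
# Walk the listing page by page: each response's .data.after token is fed
# back into the next request until there are no more pages.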
while : ; do
    # Pull the image posts' source URLs, titles and ids as line-aligned lists.
    urls=$(printf '%s' "$content" | jq -r '.data.children[] | select(.data.post_hint|test("image")?) | .data.preview.images[0].source.url')
    names=$(printf '%s' "$content" | jq -r '.data.children[] | select(.data.post_hint|test("image")?) | .data.title')
    ids=$(printf '%s' "$content" | jq -r '.data.children[] | select(.data.post_hint|test("image")?) | .data.id')
    a=1
    wait # let the previous page's downloads finish to avoid spawning too many processes
    for url in $urls; do
        name=$(printf '%s\n' "$names" | sed -n "${a}p")
        id=$(printf '%s\n' "$ids" | sed -n "${a}p")
        ext=$(printf '%s' "${url##*.}" | cut -d '?' -f 1)
        # Strip slashes from the title so it can be used as a file name.
        newname=$(printf '%s' "$name" | sed 's/^\///;s/\// /g')_"$subreddit"_$id.$ext
        echo "$name"
        wget -T "$timeout" -U "$useragent" --no-check-certificate -nv -nc -O "$subreddit/$newname" "$url" >/dev/null 2>&1 &
        a=$((a + 1))
    done
    after=$(printf '%s' "$content" | jq -r '.data.after // empty')
    if [ -z "$after" ]; then
        break
    fi
    url="https://www.reddit.com/r/$subreddit/$sort/.json?count=200&after=$after&raw_json=1&t=$top_time"
    content=$(wget -T "$timeout" -U "$useragent" --no-check-certificate -q -O - "$url")
done