summaryrefslogtreecommitdiff
path: root/imagescraper/imagescraper.py
blob: 6c23a5d8865de9b0ec28732f2820f84b326ac786 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3

"""
imagescraper.py is part of animedb.
The purpose of this program is to scrape images from MAL for each anime inside our database.
Copyright (C) 2018 Daniel Jones daniel@danieljon.es 

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""

import argparse;
import sqlite3;
import os.path;
import urllib.request;
import re;

def dbconnect(dbname):
    """
    Connect to the sqlite database and return the connection object.

    Exits the program with a diagnostic message if the database
    cannot be opened.
    """
    try:
        return sqlite3.connect(dbname)
    except sqlite3.Error as err:
        # The original handler called sys.exc_info() without importing
        # sys, which itself raised NameError; report the real error.
        exit("could not open database {}: {}".format(dbname, err))

def dbclose(db):
    """
    Close the given database connection.
    """
    db.close()

def dbgetanimeids(db):
    """
    Return a list of every animeid stored in the anime table.
    """
    cursor = db.cursor()
    # each result row is a 1-tuple; unpack the single animeid column
    # (the original loop variable "id" shadowed the builtin)
    return [row[0] for row in cursor.execute("SELECT animeid FROM anime;")]

def createlinks(ids):
    """
    Build the MAL page url for every anime id given.
    """
    base = "https://myanimelist.net/anime/{}"
    return [base.format(animeid) for animeid in ids]

def scrapelinks(animelinks):
    """
    Scrape each MAL page for its cover image link.

    Returns the first matching cdn image url per page. Pages with no
    match contribute nothing, so the result can be shorter than the
    input; NOTE(review): that would misalign links with anime ids in
    the caller -- confirm every page yields a match.
    """
    # raw string avoids deprecated escape sequences, the dot before
    # "jpg" is now escaped, and the pattern is compiled once up front
    pattern = re.compile(
        r"https://myanimelist\.cdn-dena\.com/images/anime/[0-9]*/[0-9]*\.jpg")
    links = []
    for link in animelinks:
        src = urllib.request.urlopen(link).read().decode("utf-8")
        match = pattern.search(src)
        if match:
            links.append(match.group(0))
    return links

def getcoverimage(link, animeid):
    """
    Download the cover image at link and save it as covers/<animeid>.jpg.
    """
    # urlretrieve does not create the target directory; make sure it exists
    os.makedirs("covers", exist_ok=True)
    print("downloading {}".format(link))
    urllib.request.urlretrieve(link, "covers/{}.jpg".format(animeid))

if __name__ == "__main__":
    """
    retrieve anime id's from the database
    construct urls to scrape
    scrape url for cover image link
    """

    parser = argparse.ArgumentParser();
    parser.add_argument("-d", "--database", type=str, action="store", dest="dbfile",
            default="../userdb.db", required=True,
            help="sqlite3 database file containing anime information");
    args = parser.parse_args();

    # if our database (file) doesn't exist, exit
    if not os.path.isfile(args.dbfile):
        exit("file doesn't exist {}".format(args.dbfile));

    db = dbconnect(args.dbfile);

    # collect anmie ids
    animeids = dbgetanimeids(db);
    # create MAL anime links
    animelinks = createlinks(animeids);

    # scrape links for the cover image link
    print("scraping MAL pages... this may take some time...");
    coverlinks = scrapelinks(animelinks);

    # download cover images
    x = 0;
    for link in coverlinks:
        getcoverimage(link, animeids[x]);
        x += 1;

    dbclose(db);