
Feature request for crawl_images function #384

Open

caroheymesitf opened this issue Oct 9, 2024 · 5 comments

Comments

@caroheymesitf

Hello Elias,

Is there any possibility of adding Playwright integration to the crawl_images function?

Furthermore, I would love to be able to select specific images by their XPath!

Cheers!

@eliasdabbas
Owner

Thanks @caroheymesitf

For now you can select images using a regex matched against the img src URL. This is not exactly XPath, but it can help in many cases.

What do you want the Playwright integration for? Another use case?

Thanks!

@caroheymesitf
Author

Hi Elias,

Great to get feedback!

Ideally, I need the Playwright integration in the crawl_images function. Some websites rely heavily on JavaScript, and their images do not load completely through advertools. Perhaps scrapy-playwright could be a fix.
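For reference, here is a minimal sketch of what wiring scrapy-playwright into a Scrapy spider typically looks like, using its documented DOWNLOAD_HANDLERS and TWISTED_REACTOR settings. This is an illustration only, not something crawl_images supports today; the spider name and URL are placeholders.

# Minimal sketch: render JavaScript with scrapy-playwright before parsing,
# so dynamically inserted <img> tags are present in the response.
from scrapy import Request, Spider


class JSImageSpider(Spider):
    name = "js_image_spider"
    start_urls = ["https://example.com"]  # placeholder URL
    custom_settings = {
        # Route downloads through Playwright's browser
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # scrapy-playwright requires the asyncio reactor
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }

    def start_requests(self):
        for url in self.start_urls:
            # meta={"playwright": True} asks the handler to render this page
            yield Request(url, meta={"playwright": True}, callback=self.parse)

    def parse(self, response):
        # By now JS has run, so dynamically added images are present
        yield {"image_urls": response.xpath("//img/@src").getall()}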

I wasn't able to get the Playwright integration working myself, and since the function is labeled "THIS FUNCTION IS STILL EXPERIMENTAL. Expect many changes," I was wondering if it would be possible to add XPath selection as well?

Thanks a million!

@eliasdabbas
Owner

JS: This is going to depend on how each website does it, and it will need a deeper look, because I don't think there is one approach that can easily be applied everywhere. I could be wrong though.

XPath: It's not clear to me how to implement that yet. I quickly checked the Scrapy docs and the source for the images pipeline, and I'm not sure how to implement it. I'll let you know if/when I do.
In any case, you can use min_width, min_height, and include_img_regex to restrict images, and then delete the ones you don't want if those still aren't enough.
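A quick sketch of those parameters in use, assuming the current experimental crawl_images signature; the URL and regex here are made up for illustration:

import advertools as adv

# Skip small images (icons, tracking pixels) and keep only JPEGs under /media/
adv.crawl_images(
    start_urls=['https://example.com/products'],  # placeholder URL
    output_dir='product_images',
    min_width=200,       # ignore images narrower than 200px
    min_height=200,      # ignore images shorter than 200px
    include_img_regex=r'/media/.*\.jpe?g',  # keep only matching src URLs
)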

@caroheymes

Hi Elias,
Regarding XPath, I suggest the following:

#----------------- creates module image_spider.py

import random
import re
from urllib.parse import urlsplit

import requests
from scrapy import Field, Item, Request, Spider
from scrapy.pipelines.images import ImagesPipeline

# Pick a random Chrome user agent from a public list
ua = requests.get('https://raw.githubusercontent.com/tamimibrahim17/List-of-user-agents/master/Chrome.txt').text.splitlines()
user_agents = ua[3:len(ua) - 1]  # drop the header and trailing lines
user_agent = random.choice(user_agents)


class ImgItem(Item):
    image_urls = Field()
    images = Field()
    image_location = Field()


class AdvImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        img_url = request.url
        image_name = urlsplit(img_url).path.split("/")[-1]
        # Get the image name prefix from the spider (info.spider)
        prefix = getattr(info.spider, 'prefix', '')
        if prefix:
            image_name = prefix + '_' + image_name
        return image_name


class ImageSpider(Spider):
    name = "image_spider"
    include_img_regex = None
    custom_settings = {
        "USER_AGENT": user_agent,
        "ROBOTSTXT_OBEY": False,
        "HTTPERROR_ALLOW_ALL": True,
        "ITEM_PIPELINES": {AdvImagesPipeline: 1},
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 8,
        "LOG_LEVEL": "CRITICAL",
    }

    def __init__(self, start_urls, include_img_regex=None, xpath=None, prefix='', *args, **kwargs):
        super().__init__(*args, **kwargs)
        # start_urls arrives as a comma-separated string via the -a option
        self.start_urls = start_urls.split(",")
        if include_img_regex is not None:
            self.include_img_regex = include_img_regex
        self.xpath = xpath
        self.prefix = prefix

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response):
        img_item = ImgItem()
        if self.xpath is not None:
            # Select image URLs with the user-supplied XPath
            img_src = response.xpath(self.xpath).getall()
        else:
            img_src = response.xpath("//img/@src").getall()
        if self.include_img_regex is not None:
            img_src = [
                response.urljoin(src)
                for src in img_src
                if re.search(self.include_img_regex, src)
            ]
        else:
            img_src = [response.urljoin(src) for src in img_src]
        img_item["image_urls"] = img_src
        img_item["image_location"] = response.request.url
        yield img_item

#---------end of script

#------ executes the script
import json
import os
import shutil
import subprocess

import pandas as pd

# Path to the spider module created above
image_spider_path = "image_spider.py"


def setup_image_directory(directory_path):
    """Delete and recreate the image output folder."""
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)
    os.mkdir(directory_path)


def build_settings_list(custom_settings):
    """Convert a custom_settings dict into a list of scrapy -s options."""
    settings_list = []
    if custom_settings:
        for key, val in custom_settings.items():
            setting = "=".join([key, json.dumps(val)]) if isinstance(val, dict) else "=".join([key, str(val)])
            settings_list.extend(["-s", setting])
    return settings_list


def build_scrapy_command(start_urls, output_dir, min_width, min_height, settings_list, include_img_regex, xpath, prefix):
    """Assemble the scrapy runspider command line."""
    command = [
        "scrapy",
        "runspider",
        image_spider_path,
        "-a", "start_urls=" + ",".join(start_urls),
        "-s", "IMAGES_STORE=" + output_dir,
        "-s", "IMAGES_MIN_HEIGHT=" + str(min_height),
        "-s", "IMAGES_MIN_WIDTH=" + str(min_width),
        "-o", output_dir + "/image_summary.jl",
    ] + settings_list
    if include_img_regex:
        command += ["-a", "include_img_regex=" + include_img_regex]
    if xpath:
        command += ["-a", "xpath=" + xpath]
    if prefix:
        command += ["-a", "prefix=" + prefix]
    return command


def crawl_images(start_urls, output_dir, min_width=0, min_height=0, include_img_regex=None, custom_settings=None, xpath=None, prefix=''):
    """Crawl start_urls and download their images into output_dir."""
    settings_list = build_settings_list(custom_settings)
    command = build_scrapy_command(start_urls, output_dir, min_width, min_height, settings_list, include_img_regex, xpath, prefix)
    subprocess.run(command, check=True)


def summarize_crawled_imgs(image_dir):
    """Return one row per (page URL, image URL) from image_summary.jl."""
    df = pd.read_json(image_dir.rstrip("/") + "/image_summary.jl", lines=True)
    return df[["image_location", "image_urls"]].explode("image_urls")
    
#-------------------- Launch functions
image_directory = './product_images'

setup_image_directory(image_directory)

# Launch crawler
crawl_images(
    start_urls=[
        'https://www.thecolvinco.com/es/ramos-de-flores/seleccion-flores-oto%C3%B1o-protea',
        'https://www.thecolvinco.com/es/comprar-planta-online/orquidea-blanca'
    ],
    xpath='//*[@id="__next"]/div/div/div[3]/div[2]/div/div[2]/div[2]/div[1]/div[1]/div[2]/div[1]/div/div[1]/div/div/div[1]/div/img/@src',
    output_dir=image_directory,
    prefix='mycustomprefix'
)
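And then something like this to get the page-to-image mapping, using the summarize_crawled_imgs helper defined above:

# One row per (page URL, image URL) pair found during the crawl
summary_df = summarize_crawled_imgs(image_directory)
print(summary_df.head())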

@eliasdabbas
Owner

@caroheymesitf Thanks for the code!

I think this:

if self.xpath is not None:
    img_src = response.xpath(self.xpath).getall()
else:
    img_src = response.xpath("//img/@src").getall()

should be modified to

response.xpath(f'{USER_SUPPLIED_XPATH}//*//img/@src').getall()

That way we get the img src values within the part of the page selected by the XPath.
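A rough sketch of what that could look like in the spider's parse method, assuming the user-supplied XPath selects a container element rather than the @src attribute itself (and simplifying the //*//img above to //img, which also matches img elements that are direct children):

def parse(self, response):
    img_item = ImgItem()
    if self.xpath is not None:
        # Only collect <img> src values inside the element(s) the
        # user-supplied XPath selects, per the suggestion above
        img_src = response.xpath(f"{self.xpath}//img/@src").getall()
    else:
        img_src = response.xpath("//img/@src").getall()
    img_item["image_urls"] = [response.urljoin(src) for src in img_src]
    img_item["image_location"] = response.request.url
    yield img_item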

Otherwise I think it's doable. It would just need a few naming conventions for consistency with other functions: the parameter is called xpath_selectors, and it takes a dictionary.
