Why won't my regular expressions match the following string?

I am trying to extract the class abbreviation (Econ 114) and name (Adv Quant Methods) from strings similar to "ECON 114 - 01   Adv Quant Methods" in Python.

I am using the expression r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)', which works in my regex tester. However, when I run this in scrapy the return array is empty. What am I doing wrong? (code below)
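
For context, the pattern does match with plain re when the sample string is pasted with ordinary spaces, which is presumably what the regex tester sees:

import re

s = 'ECON 114 - 01   Adv Quant Methods'  # pasted with ordinary spaces
print(re.findall(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)', s))
# [('ECON 114', 'Adv Quant Methods')]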

import scrapy;
import re as pythonRe;

#with open('../econ.html', 'r') as f:
    #html_string = f.read()

econ_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Referer': 'https://pisa.usc.edu/class_search/',
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded'}

class ClassesSpider(scrapy.Spider):
    name = "classes"


    def start_requests(self):
   
        urls = [
            'https://pisa.ucsc.edu/class_search/index.php'
            ]
        for url in urls:
            yield scrapy.Request(url=url, method="POST", headers=econ_headers, body='action=results&binds[:term]=2228&binds[:reg_status]=all&binds[:subject]=ECON&binds[:catalog_nbr_op]==&binds[:catalog_nbr]=&binds[:title]=&binds[:instr_name_op]==&binds[:instructor]=&binds[:ge]=&binds[:crse_units_op]==&binds[:crse_units_from]=&binds[:crse_units_to]=&binds[:crse_units_exact]=&binds[:days]=&binds[:times]=&binds[:acad_career]=&binds[:asynch]=A&binds[:hybrid]=H&binds[:synch]=S&binds[:person]=P', callback=self.parse)

    def parse(self, response):
        def professor_filter(item):
          if (pythonRe.search(r'\w\.', item) or "Staff" in item):
            return True

        #class_regex = pythonRe.compile(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)')
        page = response.url.split("/")[-2]
        classDict = {}
        classes = response.xpath('//a[contains(@id, "class_id")]/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)')
        professors = response.xpath('//div[contains(@class, "col-xs-6 col-sm-3")]/text()').getall()

        professors_filtered = list(filter(professor_filter, professors))

        #for x in range((len(classes))):
          #classDict[classes[x]] = {'professor': professors_filtered[x]}

        print(classes)
        print(len(classes))
        print(professors_filtered)
        print(len(professors_filtered))
        print(professors)
        print(classDict)
        
        filename = f'class-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')

CodePudding user response:

If you first get the full text for the classes and display it, you will see that scrapy gives \xa0 instead of &nbsp;.

And you have to use \xa0 instead of [ ].

classes = response.xpath('//a[contains(@id, "class_id")]/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+[\xa0]+([\w\s]+\b)')

and this gives me:

classes: ['ECON 1', 'Intro Microeconomic', 'ECON 1', 'Intro Microeconomic', 'ECON 2', 'Intro Macroeconomic', 'ECON 10A', 'Econ of Accounting', 'ECON 10A', 'Econ of Accounting', 'ECON 11A', 'Math Methd for Econ', 'ECON 11B', 'Math Methds Econ II', 'ECON 100A', 'Intermed Microecon', 'ECON 100A', 'Intermed Microecon', 'ECON 100B', 'Intermed Macroecon', 'ECON 101', 'Managerial Econ', 'ECON 104', 'Numbr Truth', 'ECON 111A', 'Intermed Account I', 'ECON 113', 'Intro Econometrics', 'ECON 113', 'Intro Econometrics', 'ECON 114', 'Adv Quant Methods', 'ECON 117B', 'Tax Factors', 'ECON 125', 'Econ History Of US', 'ECON 126', 'Why Succeed', 'ECON 133', 'Security Markets', 'ECON 136', 'Business Strategy', 'ECON 141', 'Internatl Finance', 'ECON 150', 'Public Finance', 'ECON 161A', 'Marketing', 'ECON 166A', 'Game Theory']
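
You can see the same difference with plain re outside scrapy (the test string below is built by hand, with \xa0 where the page has &nbsp;):

import re

text = 'ECON 114 - 01\xa0\xa0\xa0Adv Quant Methods'  # hand-built sample with \xa0

# [ ] matches only a normal space, so nothing is found
print(re.findall(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)', text))   # []

# matching \xa0 finds both groups
print(re.findall(r'(?i)(\w+\s\w+)+\s-\s\w+[\xa0]+([\w\s]+\b)', text))
# [('ECON 114', 'Adv Quant Methods')]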

I think the problem is that response.body gives the original string with HTML, but other functions may have to convert this string to an HTML tree (like in the modules lxml or BeautifulSoup), and it may automatically convert HTML entities (like &nbsp;) to chars.

As far as I know, scrapy uses parsel to select elements in HTML.

See Scrapy doc: Selectors
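
You can check this conversion with parsel alone (the markup below is only a made-up example, not the real page):

from parsel import Selector

# &nbsp; entities in the source HTML are decoded to \xa0 in the extracted text
sel = Selector(text='<a id="class_id_test">ECON 114 - 01&nbsp;&nbsp;&nbsp;Adv Quant Methods</a>')
print(repr(sel.xpath('//a/text()').get()))
# 'ECON 114 - 01\xa0\xa0\xa0Adv Quant Methods'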


EDIT:

Full working code with other changes

  • I use FormRequest
  • first I search for the rows in the table and then search for the class and professor in every row separately.
import scrapy
import re

econ_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Referer': 'https://pisa.usc.edu/class_search/',
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
}

data = {
    'action': 'results',
    'binds[:term]': '2228',
    'binds[:reg_status]': 'all',
    'binds[:subject]': 'ECON',
    'binds[:catalog_nbr_op]': '=',
    'binds[:catalog_nbr]': '',
    'binds[:title]': '',
    'binds[:instr_name_op]': '=',
    'binds[:instructor]': '',
    'binds[:ge]': '',
    'binds[:crse_units_op]': '=',
    'binds[:crse_units_from]': '',
    'binds[:crse_units_to]': '',
    'binds[:crse_units_exact]': '',
    'binds[:days]': '',
    'binds[:times]': '',
    'binds[:acad_career]': '',
    'binds[:asynch]': 'A',
    'binds[:hybrid]': 'H',
    'binds[:synch]': 'S',
    'binds[:person]': 'P',
}

def professor_filter(item):
    return (re.search(r'\w\.', item) or "Staff" in item)

class ClassesSpider(scrapy.Spider):
    
    name = "classes"

    def start_requests(self):
        urls = ['https://pisa.ucsc.edu/class_search/index.php']
        for url in urls:
            #yield scrapy.Request(url,
            #                     headers=econ_headers,
            #                     body='action=results&binds[:term]=2228&binds[:reg_status]=all&binds[:subject]=ECON&binds[:catalog_nbr_op]==&binds[:catalog_nbr]=&binds[:title]=&binds[:instr_name_op]==&binds[:instructor]=&binds[:ge]=&binds[:crse_units_op]==&binds[:crse_units_from]=&binds[:crse_units_to]=&binds[:crse_units_exact]=&binds[:days]=&binds[:times]=&binds[:acad_career]=&binds[:asynch]=A&binds[:hybrid]=H&binds[:synch]=S&binds[:person]=P',
            #                     callback=self.parse)

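            # FormRequest URL-encodes the `data` dict into a POST body, equivalent to the
            # hand-built 'action=results&binds[:term]=2228&...' string in the commented-out Request above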
            yield scrapy.FormRequest(url,
                                 headers=econ_headers,
                                 formdata=data,
                                 callback=self.parse)

    def parse(self, response):

        page = response.url.split("/")[-2]

        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')

        classDict = {}
        
        for row in all_rows:
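            # .re() returns the capture groups as a flat list of strings,
            # e.g. ['ECON 114', 'Adv Quant Methods']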
            classname = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            professor = row.xpath('(.//div[contains(@class, "col-xs-6 col-sm-3")])[3]/text()').get().strip()
            print(classname, professor)
            if professor and professor_filter(professor):
                classDict[tuple(classname)] = [professor]
                yield {'class': tuple(classname), 'professor': professor}  # the yielded item gets written to the csv file
            else:
                print('skip:', professor)
        print(classDict)
        
        #filename = f'class-{page}.html'
        #with open(filename, 'wb') as f:
        #    f.write(response.body)
        #self.log(f'Saved file {filename}')

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ClassesSpider)
c.start() 
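
If you save this as a standalone script (for example classes_spider.py; the name is arbitrary) and run it with python, CrawlerProcess starts the spider without a scrapy project and the yielded items end up in output.csv because of the FEEDS setting.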