BeautifulSoup find extract info to custom-made list-CodePudding

I have 3 different types of HTML snippets, each of which is part of a larger page, as follows:

<html>
 <body>
  <span _ngcontent-dna-c199="" class="font-weight-bold">
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    <span _ngcontent-dna-c199="" translate="">
     issue_number
    </span>
    4 Näköispainos
   </span>
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    6.12.1939
   </span>
  </span>
 </body>
</html>

and

<html>
 <body>
  <span _ngcontent-sut-c199="" class="font-weight-bold">
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    <span _ngcontent-sut-c199="" translate="">
     issue_number
    </span>
    8
   </span>
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    1998
   </span>
  </span>
 </body>
</html>

and

<html>
 <body>
  <span _ngcontent-dgu-c199="" class="font-weight-bold">
   <span _ngcontent-dgu-c199="" class="ng-star-inserted">
    1905
   </span>
  </span>
 </body>
</html>

Given the following code:

from bs4 import BeautifulSoup
# Parse one snippet at a time; `html` stands for html_1, html_2 or html_3.
soup = BeautifulSoup(html, "lxml")
# Locate the first <span> whose class is "font-weight-bold".
res = soup.find("span", attrs={"class": "font-weight-bold"})
# Flatten that span to plain text and split it on any run of whitespace.
tokens = res.get_text().split()
print(tokens)

I get the following results:

['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', '1998']                      # html_2
['1905']                                           # html_3

However, my desired custom-made list should have 4 elements and look like this:

desired_list = ["issue_number", "number", "extension", "date"]

So if a piece of information is not available in an HTML snippet, I'd like to get None (or simply "-") in the corresponding position of my desired custom list, as follows:

['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', None, '1998']                # html_2
[None, None, None, '1905']                         # html_3

Is there any way to manipulate the result list to obtain the desired list using soup.find()?

CodePudding user response：

Try it like below:

from bs4 import BeautifulSoup

# Sample documents (html_1, html_2, html_3), one per layout:
#   1. label + number + extension + date
#   2. label + number + date
#   3. date only
# The outer <span> carries class "font-weight-bold" and every field <span>
# carries class "ng-star-inserted"; the extraction code selects on the
# latter, so these attributes must be present for anything to match.
pages = [
'''
<html>
 <body>
  <span _ngcontent-dna-c199="" class="font-weight-bold">
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    <span _ngcontent-dna-c199="" translate="">
     issue_number
    </span>
    4 Näköispainos
   </span>
   <span _ngcontent-dna-c199="" class="ng-star-inserted">
    6.12.1939
   </span>
  </span>
 </body>
</html>
''',
'''
<html>
 <body>
  <span _ngcontent-sut-c199="" class="font-weight-bold">
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    <span _ngcontent-sut-c199="" translate="">
     issue_number
    </span>
    8
   </span>
   <span _ngcontent-sut-c199="" class="ng-star-inserted">
    1998
   </span>
  </span>
 </body>
</html>
''',
'''
<html>
 <body>
  <span _ngcontent-dgu-c199="" class="font-weight-bold">
   <span _ngcontent-dgu-c199="" class="ng-star-inserted">
    1905
   </span>
  </span>
 </body>
</html>
''' ]
# Build {'html_<n>': [label, number, extension, date]} for every page.
# Missing fields are stored as the string 'None' (not the None object).
my_dct = {}
for i, page in enumerate(pages, start=1):
    soup = BeautifulSoup(page, "lxml") # html_1, html_2, html_3
    spans = soup.find_all("span", attrs={"class": "ng-star-inserted"})

    if not spans:
        # No field spans at all: every slot is a placeholder.
        my_dct['html_' + str(i)] = ['None'] * 4
        continue

    my_list = []

    # Slot 1: the label lives in a <span> nested inside the first field span.
    sp_0 = spans[0].span
    if sp_0 is not None:
        txt = sp_0.text.replace('\n', '').strip()
        my_list.append(txt)
    else:
        # Reset so a label from a previous page is never reused below.
        txt = ''
        my_list.append('None')

    if len(spans) > 1:
        # Slots 2 and 3: whatever remains of the first field span once the
        # label text is removed -- first word is the number, second (if any)
        # is the extension.
        parts = spans[0].text.replace('\n', '').replace(txt, '').strip().split()
        my_list.append(parts[0] if parts else 'None')
        my_list.append(parts[1] if len(parts) > 1 else 'None')
        date_span = spans[1]
    else:
        # A single field span holds only the date.
        my_list.extend(['None', 'None'])
        date_span = spans[0]

    # Slot 4: the date.
    my_list.append(date_span.text.replace('\n', '').strip())

    my_dct['html_' + str(i)] = my_list
print(my_dct)

'''  R e s u l t :
{
  'html_1': ['issue_number', '4', 'Näköispainos', '6.12.1939'], 
  'html_2': ['issue_number', '8', 'None', '1998'], 
  'html_3': ['None', 'None', 'None', '1905']
}
'''

Regards...

CodePudding user response：

You could search for each part individually:

def bs2customList(soup):
    """Return ``[issue_label, number, extension, date]`` for one parsed page.

    Args:
        soup: A parsed document, e.g. ``BeautifulSoup(html, 'lxml')``.

    Returns:
        A 4-item list. Each item is a string, or ``None`` when the page has
        no matching text for that field (including when there is no
        ``span.font-weight-bold`` at all).
    """
    fwb = soup.find("span", class_="font-weight-bold")

    # One tuple per non-blank text node directly inside a <span>:
    # (text, parent's class list, parent's translate attr, nested <span> count)
    fdesc = [] if fwb is None else [(
        d.get_text(strip=True),
        d.parent.get('class', []),
        d.parent.get('translate'),
        len(d.parent.select('span')),
    ) for d in fwb.descendants
        if d.name is None and d.parent.name == 'span' and d.get_text(strip=True)]

    # Signatures matched against the last three tuple items, in the order
    # label / date / "number [extension]":
    #   label  -> parent has no class, translate="" and no nested span
    #   date   -> parent is ng-star-inserted with no nested span
    #   number -> parent is ng-star-inserted with exactly one nested span
    filters = [([], '', 0), (['ng-star-inserted'], None, 0), (['ng-star-inserted'], None, 1)]
    issueNum, ydate, num_ext = [[d[0] for d in fdesc if d[1:] == f] for f in filters]

    tokens = num_ext[0].split() if num_ext else []
    # Keep both fields as lists so the [0] below picks a whole value rather
    # than the first character of a string (e.g. "12" must not become "1").
    num = tokens[:1]
    ext = [' '.join(tokens[1:])] if len(tokens) > 1 else []

    return [(d[0] if d else None) for d in [issueNum, num, ext, ydate]]

or maybe this is more understandable:

def bs2customList(soup):
    """Return ``[issue_label, number, extension, date]`` for one parsed page.

    Args:
        soup: A parsed document, e.g. ``BeautifulSoup(html, 'lxml')``.

    Returns:
        A 4-item list of strings, with ``None`` for each field not found.
        All four are ``None`` when there is no ``span.font-weight-bold`` or
        it contains no ``span.ng-star-inserted``.
    """
    bold = soup.find("span", class_="font-weight-bold")
    inserted = bold.select('span.ng-star-inserted') if bold is not None else []
    if not inserted:
        return [None, None, None, None]

    # Split the field spans by whether they contain a <span translate> label.
    labelled, unlabelled = [], []
    for span in inserted:
        bucket = labelled if span.select('span[translate]') else unlabelled
        bucket.append(span)

    issueNum = num = ext = None
    if labelled:
        head = labelled[0]
        issueNum = head.select_one('span[translate]').get_text(strip=True)

        # Non-blank text sitting directly in the span (not in the label).
        own_texts = []
        for child in head.children:
            if child.name is None:
                text = child.get_text(strip=True)
                if text:
                    own_texts.append(text)

        if own_texts:
            tokens = own_texts[0].split()
            # keep tokens[0].isdigit() only if "number" is always integer
            if len(tokens) > 1 and tokens[0].isdigit():
                num = tokens[0]
                ext = ' '.join(tokens[1:])
            else:
                num = ' '.join(tokens)

    # The date is the first field span that has no label inside it.
    ydate = unlabelled[0].get_text(strip=True) if unlabelled else None

    return [issueNum, num, ext, ydate]

Whichever version of the function is used, with the below test set:

# Test documents: the three real layouts, then a textless document and an
# empty string. The extractor looks for span.font-weight-bold containing
# span.ng-star-inserted, so those class attributes must be present.
htmls = [
    '''
    <html>
      <body>
        <span _ngcontent-dna-c199="" class="font-weight-bold">
          <span _ngcontent-dna-c199="" class="ng-star-inserted">
            <span _ngcontent-dna-c199="" translate="">
              issue_number
            </span>
            4 Näköispainos
          </span>
          <span _ngcontent-dna-c199="" class="ng-star-inserted">
            6.12.1939
          </span>
        </span>
      </body>
    </html>
    ''',
    '''
    <html>
      <body>
        <span _ngcontent-sut-c199="" class="font-weight-bold">
          <span _ngcontent-sut-c199="" class="ng-star-inserted">
            <span _ngcontent-sut-c199="" translate="">
              issue_number
            </span>
            8
          </span>
          <span _ngcontent-sut-c199="" class="ng-star-inserted">
            1998
          </span>
        </span>
      </body>
    </html>
    ''',
    '''
    <html>
      <body>
        <span _ngcontent-dgu-c199="" class="font-weight-bold">
          <span _ngcontent-dgu-c199="" class="ng-star-inserted">
            1905
          </span>
        </span>
      </body>
    </html>
    ''',
    '<html><body><span class="font-weight-bold"></span></body></html>',
    '' # empty str
]

printing with

# Parse each test document and print the list the extractor returns for it.
for h in htmls:
    parsed = BeautifulSoup(h, 'lxml')
    print(bs2customList(parsed))

gives the same output [with both versions]:

['issue_number', '4', 'Näköispainos', '6.12.1939']
['issue_number', '8', None, '1998']
[None, None, None, '1905']
[None, None, None, None]
[None, None, None, None]

(The last 2 tests are with an empty [textless] html and an empty string.)