from mechanize import Browser
br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
  print line

当在HTML文件中打印一行时,我试图找到一种方法,只显示每个HTML元素的内容,而不是格式本身。如果它发现'<a href="等等。例如">some text</a>',它只会打印'some text', '<b>hello</b>'打印'hello',等等。该怎么做呢?


当前回答

这个方法对我来说完美无缺,不需要额外的安装:

import re
import htmlentitydefs

def convertentity(m):
    if m.group(1)=='#':
        try:
            return unichr(int(m.group(2)))
        except ValueError:
            return '&#%s;' % m.group(2)
        try:
            return htmlentitydefs.entitydefs[m.group(2)]
        except KeyError:
            return '&%s;' % m.group(2)

def converthtml(s):
    return re.sub(r'&(#?)(.+?);',convertentity,s)

html =  converthtml(html)
html.replace("&nbsp;", " ") ## Get rid of the remnants of certain formatting(subscript,superscript,etc).

其他回答

import re

def remove(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

我总是使用这个函数来剥离HTML标签,因为它只需要Python标准库:

对于Python 3:

from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

对于Python 2:

from HTMLParser import HTMLParser
from StringIO import StringIO

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

我就是这么做的,但我不知道我在做什么。我通过剥离HTML标记从HTML表中获取数据。

它接受字符串“name”并返回不带HTML标记的字符串“name1”。

x = 0
anglebrackets = 0
name1 = ""
while x < len(name):
    
    if name[x] == "<":
        anglebrackets = anglebrackets + 1
    if name[x] == ">":
        anglebrackets = anglebrackets - 1
    if anglebrackets == 0:
        if name[x] != ">":
            name1 = name1 + name[x]
    x = x + 1

我正在解析Github自述,我发现下面的工作真的很好:

import re
import lxml.html

def strip_markdown(x):
    links_sub = re.sub(r'\[(.+)\]\([^\)]+\)', r'\1', x)
    bold_sub = re.sub(r'\*\*([^*]+)\*\*', r'\1', links_sub)
    emph_sub = re.sub(r'\*([^*]+)\*', r'\1', bold_sub)
    return emph_sub

def strip_html(x):
    return lxml.html.fromstring(x).text_content() if x else ''

然后

readme = """<img src="https://raw.githubusercontent.com/kootenpv/sky/master/resources/skylogo.png" />

            sky is a web scraping framework, implemented with the latest python versions in mind (3.4+). 
            It uses the asynchronous `asyncio` framework, as well as many popular modules 
            and extensions.

            Most importantly, it aims for **next generation** web crawling where machine intelligence 
            is used to speed up the development/maintainance/reliability of crawling.

            It mainly does this by considering the user to be interested in content 
            from *domains*, not just a collection of *single pages*
            ([templating approach](#templating-approach))."""

strip_markdown(strip_html(readme))

正确移除所有markdown和html。

你可以使用BeautifulSoup get_text()特性。

from bs4 import BeautifulSoup

html_str = '''
<td><a href="http://www.fakewebsite.example">Please can you strip me?</a>
<br/><a href="http://www.fakewebsite.example">I am waiting....</a>
</td>
'''
soup = BeautifulSoup(html_str)

print(soup.get_text())
#or via attribute of Soup Object: print(soup.text)

建议显式地指定解析器,例如BeautifulSoup(html_str, features="html.parser"),以便输出可重现。