from mechanize import Browser
# Fetch the page and echo its raw HTML one line at a time.
br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the bare `print line` statement is a SyntaxError on Python 3.
    print(line)

在打印HTML文件中的某一行时,我想找到一种方法,只显示每个HTML元素的内容,而不显示标记本身。例如,遇到'<a href="whatever.example">some text</a>'时只打印'some text',遇到'<b>hello</b>'时只打印'hello',依此类推。该怎么做呢?


当前回答

这是一个快速修复,还可以进一步优化,但已经可以正常工作。这段代码会把所有非空标签替换为空字符串"",从而去掉输入文本中的所有HTML标签。可以用 ./file.py inputfile outputfile 来运行它。

    #!/usr/bin/python
import sys

def replace(strng, replaceText):
    """Remove every occurrence of ``replaceText`` from ``strng``.

    Occurrences are deleted one at a time, always the leftmost first, and the
    search restarts after each deletion. An occurrence that only comes into
    existence once its neighbours are joined is therefore removed as well
    (``replace("aabb", "ab")`` returns ``""``).

    Args:
        strng: The text to clean.
        replaceText: The substring to delete.

    Returns:
        ``strng`` with no remaining occurrence of ``replaceText``. An empty
        ``replaceText`` returns ``strng`` unchanged.
    """
    # An empty needle is found at index 0 of any string and deleting it
    # changes nothing, so without this guard the loop would never terminate.
    if not replaceText:
        return strng
    while replaceText in strng:
        # count=1 deletes only the leftmost match, then the scan starts over.
        strng = strng.replace(replaceText, "", 1)
    return strng


def _strip_tags(line):
    """Return ``line`` with each non-empty ``<...>`` span deleted.

    A span runs from the most recent ``<`` to the next ``>``. Spans are
    located in the unmodified ``line`` and removed from a working copy with
    ``replace``, so every repetition of the same tag text goes at once.
    """
    stripped = line
    tag_start = -1
    for pos, char in enumerate(line):
        if char == "<":
            tag_start = pos
        elif char == ">" and tag_start > -1:
            tag = line[tag_start:pos + 1]
            # A bare "<>" is kept as text; the pending "<" stays open, so a
            # later ">" still ends a span that starts there.
            if tag != "<>":
                stripped = replace(stripped, tag)
                tag_start = -1
    return stripped


def _decode_angle_entities(text):
    """Turn ``&lt``/``&gt`` (with or without the ``;``) into ``<``/``>``."""
    # The terminated form goes first: replacing the bare "&lt" first would
    # turn "&lt;" into "<;" and leave a stray semicolon behind.
    for entity, char in (("&lt;", "<"), ("&lt", "<"),
                         ("&gt;", ">"), ("&gt", ">")):
        text = text.replace(entity, char)
    return text


def main(argv):
    """Strip HTML tags from the file ``argv[1]`` and write text to ``argv[2]``.

    Args:
        argv: Command line in ``sys.argv`` layout:
            ``[program, inputfile, outputfile]``.

    Returns:
        0 on success; 1 when an argument is missing or a file cannot be
        read or written (a usage message is printed in that case).
    """
    try:
        in_path, out_path = argv[1], argv[2]
        # Read the input completely before the output is opened: opening the
        # output for writing truncates it, which must not happen when the
        # input is unreadable, and would empty the input if both paths match.
        with open(in_path, 'r') as source:
            lines = source.readlines()
        with open(out_path, 'w') as target:
            for line in lines:
                # Tags are removed before entities are decoded, so a decoded
                # "<" or ">" is never mistaken for markup.
                target.write(_decode_angle_entities(_strip_tags(line)))
    except (IndexError, IOError) as err:
        print("Help: invalid arguments or exception")
        print("Error: %s" % err)
        print("Usage :  %s  inputfile outputfile" % argv[0])
        return 1
    print("Write To --- > %s" % out_path)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

其他回答

如果你需要保留HTML实体(例如&amp;),我在Eloff的答案基础上添加了“handle_entityref”方法。

from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that keeps text and character references and drops tags.

    Text nodes are collected verbatim. Named references are re-emitted as
    ``&name;`` and numeric ones as ``&#number;`` instead of being decoded.
    Read the accumulated result with ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer rather than only reset(): on Python 3 it
        # sets parser state that reset() alone does not. The explicit call
        # form also works where the base class does not support super().
        HTMLParser.__init__(self)
        # Python 3's parser decodes references itself by default and then
        # never calls handle_entityref/handle_charref; switch that off so
        # the references reach the handlers below. On a parser without this
        # option the attribute is simply unused.
        self.convert_charrefs = False
        self.fed = []

    def handle_data(self, d):
        """Collect a run of plain text."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Keep a named reference such as ``&amp;`` in escaped form."""
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        """Keep a numeric reference such as ``&#956;`` in escaped form."""
        self.fed.append('&#%s;' % name)

    def get_data(self):
        """Return everything collected so far as one string."""
        return ''.join(self.fed)

def html_to_text(html):
    """Return the text of ``html`` with tags removed, as collected by MLStripper."""
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text

如果你想去掉所有HTML标签,我发现最简单的方法是使用BeautifulSoup:

from bs4 import BeautifulSoup  # Or from BeautifulSoup import BeautifulSoup

def stripHtmlTags(htmlTxt):
    """Return the text of ``htmlTxt`` with the HTML markup removed.

    Args:
        htmlTxt: HTML markup, or ``None``.

    Returns:
        The document's string nodes joined together, or ``None`` when
        ``htmlTxt`` is ``None``.
    """
    if htmlTxt is None:
        return None
    # findAll(text=True) selects the parsed document's string nodes.
    return ''.join(BeautifulSoup(htmlTxt).findAll(text=True))

我尝试了接受的答案的代码,但我得到了“RuntimeError:最大递归深度超出”,这没有发生在上面的代码块。

我需要一种方法来剥离标签和解码HTML实体为纯文本。下面的解决方案是基于Eloff的答案(我不能使用,因为它剥离实体)。

import html.parser

class HTMLTextExtractor(html.parser.HTMLParser):
    """Parser that gathers every piece of text passed to ``handle_data``."""

    def __init__(self):
        super().__init__()
        # Text fragments, in the order the parser delivered them.
        self.result = []

    def handle_data(self, d):
        """Store one text fragment."""
        self.result.append(d)

    def get_text(self):
        """Return all stored fragments joined into a single string."""
        return "".join(self.result)

def html_to_text(html):
    """Return ``html`` as plain text: tags removed, entities decoded.

    >>> html_to_text('<a href="#">Demo<!--...--> <em>(&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>')
    'Demo (\xac \u0394\u03b7\u03bc\u03ce)'

    Plain text is not HTML-safe text; the result may itself look like markup.
    >>> html_to_text('&lt;script&gt;alert("Hello");&lt;/script&gt;')
    '<script>alert("Hello");</script>'

    Escape it with html.escape before placing it in an HTML context.

    Malformed HTML is handled on a best-effort basis by HTMLParser.
    >>> html_to_text('x < y &lt z <!--b')
    'x < y < z '

    Named entities follow the HTML 5 rules.
    >>> html_to_text('&nosuchentity; &apos; ')
    "&nosuchentity; ' "
    """
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()

快速测试:

# Keep the sample under its own name: calling it `html` would rebind the
# `html` module this snippet's code imports and break any later
# `html.escape(...)` call.
sample_html = '<a href="#">Demo <em>(&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>'
print(repr(html_to_text(sample_html)))

结果:

'Demo (¬ Δημώ)'

安全提示:不要混淆HTML剥离(将HTML转换为纯文本)和HTML消毒(将纯文本转换为HTML)。这个答案将删除HTML并将实体解码为纯文本——这并不能使结果在HTML上下文中安全使用。

例如:&lt;script&gt;alert("Hello");&lt;/script&gt; 会被转换为 <script>alert("Hello");</script>。这是完全正确的行为,但如果把得到的纯文本原样插入HTML页面,这显然是不够的。

规则并不难:每当把纯文本字符串插入HTML输出时,都要先对它进行HTML转义(使用html.escape(s)),即使你“确信”它不包含HTML(例如,因为你已经剥离过其中的HTML内容)。

但是,OP询问是否将结果打印到控制台,在这种情况下不需要HTML转义。相反,你可能想要剥离ASCII控制字符,因为它们会触发不需要的行为(特别是在Unix系统上):

import re
# Convert the untrusted HTML to text, then delete ASCII control characters
# (NUL through 0x1F, plus DEL) before the text reaches the terminal.
text = html_to_text(untrusted_html_input)
control_chars = re.compile(r'[\0-\x1f\x7f]+')
clean_text = control_chars.sub('', text)
# To let newlines through, use this pattern instead:
# control_chars = re.compile(r'[\0-\x09\x0b-\x1f\x7f]+')
print(clean_text)

我正在解析Github自述,我发现下面的工作真的很好:

import re
import lxml.html

def strip_markdown(x):
    """Remove basic Markdown link, bold and emphasis markup from ``x``.

    ``[text](target)`` becomes ``text``; ``**text**`` and ``*text*`` become
    ``text``.

    Args:
        x: Markdown source text.

    Returns:
        ``x`` with that markup removed.
    """
    # The link text is matched as "anything but ]" so each link is handled
    # on its own; a greedy ".+" would swallow everything between the first
    # "[" and the last "](...)" on a line that holds several links.
    links_sub = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', x)
    # Bold goes before emphasis: run first, the single-asterisk pattern would
    # match inside each "**" pair and leave "*text*" behind.
    bold_sub = re.sub(r'\*\*([^*]+)\*\*', r'\1', links_sub)
    return re.sub(r'\*([^*]+)\*', r'\1', bold_sub)

def strip_html(x):
    """Return the text content of the HTML in ``x``; '' when ``x`` is empty."""
    if not x:
        return ''
    document = lxml.html.fromstring(x)
    return document.text_content()

然后

# Sample input: an HTML <img> tag followed by text that uses Markdown
# backticks, bold, emphasis and a link.
readme = """<img src="https://raw.githubusercontent.com/kootenpv/sky/master/resources/skylogo.png" />

            sky is a web scraping framework, implemented with the latest python versions in mind (3.4+). 
            It uses the asynchronous `asyncio` framework, as well as many popular modules 
            and extensions.

            Most importantly, it aims for **next generation** web crawling where machine intelligence 
            is used to speed up the development/maintainance/reliability of crawling.

            It mainly does this by considering the user to be interested in content 
            from *domains*, not just a collection of *single pages*
            ([templating approach](#templating-approach))."""

# HTML is stripped first, then the Markdown; the return value is not stored.
strip_markdown(strip_html(readme))

正确移除所有markdown和html。

这是一个与目前被接受的答案(https://stackoverflow.com/a/925630/95989)类似的解决方案,不同之处在于它直接使用HTMLParser类本身(即不进行子类化),因此明显更加简洁:

def strip_html(text):
    """Return ``text`` with markup removed, using a plain HTMLParser instance."""
    chunks = []
    stripper = HTMLParser()
    # Route every text callback straight into the list; no subclass needed.
    stripper.handle_data = chunks.append
    stripper.feed(text)
    return "".join(chunks)