pkg/htmltext/htmltext.go (144 lines of code) (raw):
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package htmltext
import (
"io"
"net/http"
"net/url"
"regexp"
"strings"
"unicode/utf8"
"github.com/Machiel/slugify"
"github.com/apache/answer/pkg/checker"
"github.com/apache/answer/pkg/converter"
strip "github.com/grokify/html-strip-tags-go"
"github.com/mozillazg/go-pinyin"
)
// ClearText clear HTML, get the clear text
func ClearText(html string) (text string) {
if len(html) == 0 {
text = html
return
}
var (
re *regexp.Regexp
codeReg = `(?ism)<(pre)>.*<\/pre>`
codeRepl = "{code...}"
linkReg = `(?ism)<a.*?[^<]>(.*)?<\/a>`
linkRepl = " [$1] "
spaceReg = ` +`
spaceRepl = " "
)
re = regexp.MustCompile(codeReg)
html = re.ReplaceAllString(html, codeRepl)
re = regexp.MustCompile(linkReg)
html = re.ReplaceAllString(html, linkRepl)
text = strings.NewReplacer(
"\n", " ",
"\r", " ",
"\t", " ",
).Replace(strip.StripTags(html))
// replace multiple spaces to one space
re = regexp.MustCompile(spaceReg)
text = strings.TrimSpace(re.ReplaceAllString(text, spaceRepl))
return
}
func UrlTitle(title string) (text string) {
title = convertChinese(title)
title = clearEmoji(title)
title = slugify.Slugify(title)
title = url.QueryEscape(title)
title = cutLongTitle(title)
if len(title) == 0 {
title = "topic"
}
return title
}
func clearEmoji(s string) string {
ret := ""
rs := []rune(s)
for i := 0; i < len(rs); i++ {
if len(string(rs[i])) != 4 {
ret += string(rs[i])
}
}
return ret
}
func convertChinese(content string) string {
has := checker.IsChinese(content)
if !has {
return content
}
return strings.Join(pinyin.LazyConvert(content, nil), "-")
}
func cutLongTitle(title string) string {
if len(title) > 150 {
return title[0:150]
}
return title
}
// FetchExcerpt return the excerpt from the HTML string
func FetchExcerpt(html, trimMarker string, limit int) (text string) {
return FetchRangedExcerpt(html, trimMarker, 0, limit)
}
// findFirstMatchedWord returns the first matched word and its index
func findFirstMatchedWord(text string, words []string) (string, int) {
if len(text) == 0 || len(words) == 0 {
return "", 0
}
words = converter.UniqueArray(words)
firstWord := ""
firstIndex := len(text)
for _, word := range words {
if idx := strings.Index(text, word); idx != -1 && idx < firstIndex {
firstIndex = idx
firstWord = word
}
}
if firstIndex != len(text) {
return firstWord, firstIndex
}
return "", 0
}
// getRuneRange returns the valid begin and end indexes of the runeText
func getRuneRange(runeText []rune, offset, limit int) (begin, end int) {
runeLen := len(runeText)
limit = min(runeLen, max(0, limit))
begin = min(runeLen, max(0, offset))
end = min(runeLen, begin+limit)
return
}
// FetchRangedExcerpt returns a ranged excerpt from the HTML string.
// Note: offset is a rune index, not a byte index
func FetchRangedExcerpt(html, trimMarker string, offset int, limit int) (text string) {
if len(html) == 0 {
text = html
return
}
runeText := []rune(ClearText(html))
begin, end := getRuneRange(runeText, offset, limit)
text = string(runeText[begin:end])
if begin > 0 {
text = trimMarker + text
}
if end < len(runeText) {
text = text + trimMarker
}
return
}
// FetchMatchedExcerpt returns the matched excerpt according to the words
func FetchMatchedExcerpt(html string, words []string, trimMarker string, trimLength int) string {
text := ClearText(html)
matchedWord, matchedIndex := findFirstMatchedWord(text, words)
runeIndex := utf8.RuneCountInString(text[0:matchedIndex])
trimLength = max(0, trimLength)
runeOffset := runeIndex - trimLength
runeLimit := trimLength + trimLength + utf8.RuneCountInString(matchedWord)
textRuneCount := utf8.RuneCountInString(text)
if runeOffset+runeLimit > textRuneCount {
// Reserved extra chars before the matched word
runeOffset = textRuneCount - runeLimit
}
return FetchRangedExcerpt(html, trimMarker, runeOffset, runeLimit)
}
func GetPicByUrl(Url string) string {
res, err := http.Get(Url)
if err != nil {
return ""
}
defer res.Body.Close()
pix, err := io.ReadAll(res.Body)
if err != nil {
return ""
}
return string(pix)
}