htmlquery.go 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. package domquery
  2. import (
  3. "fmt"
  4. "strings"
  5. )
  6. /* End Node Functions */
  7. func LoadHTML(htmlStr string) *Node {
  8. tokList := getTokenList(htmlStr)
  9. bt := buildTree(tokList)
  10. return bt
  11. }
  // getTokenList splits htmlStr into a flat list of tokens: tags ("<p>",
  // "</p>", "<!-- ... -->") and the text runs between them.
  //
  // Rules applied while scanning, in this order:
  //   - Once the pending token ends in "</style" (or "</script"), the text
  //     before it is emitted, followed by a normalized "</style>" /
  //     "</script>" token; the byte that follows the closing name is dropped.
  //   - While the most recently emitted token contains "<style" or "<script",
  //     bytes are accumulated verbatim, so '<' and '>' inside CSS/JS do not
  //     start or end tokens.
  //   - "<?" starts a PHP/processing-instruction run and "?>" ends it; '<' and
  //     '>' inside the run do not create token boundaries, so the run stays
  //     part of the surrounding token.
  //   - Otherwise '<' ends the pending token (emitted even when empty) and
  //     starts a new one, and '>' ends the pending token inclusively.
  //
  // Whatever is still pending at the end of the input is emitted as the final
  // token, so the result always holds at least one element.
  //
  // Tokens are substrings of htmlStr, so multi-byte UTF-8 text is preserved
  // byte-for-byte, and a '<' at the very end or a '>' at the very start of the
  // input is handled without indexing past the string bounds.
  func getTokenList(htmlStr string) []string {
  	const (
  		styleOpen   = "<style"
  		styleClose  = "</style"
  		scriptOpen  = "<script"
  		scriptClose = "</script"
  	)

  	var (
  		tokList  []string
  		start    int  // htmlStr[start:i] is the pending token
  		inPHP    bool // between "<?" and "?>"
  		inStyle  bool // last emitted token contains "<style"
  		inScript bool // last emitted token contains "<script"
  	)

  	// emit appends tokens and refreshes the raw-text flags from the last one,
  	// so the loop does not have to re-inspect the token list on every byte.
  	emit := func(toks ...string) {
  		tokList = append(tokList, toks...)
  		last := toks[len(toks)-1]
  		inStyle = strings.Contains(last, styleOpen)
  		inScript = strings.Contains(last, scriptOpen)
  	}

  	for i := 0; i < len(htmlStr); i++ {
  		tok := htmlStr[start:i]

  		// The pending token grows one byte at a time and is checked on every
  		// iteration, so a closing name can only ever show up as its suffix.
  		if strings.HasSuffix(tok, styleClose) {
  			emit(strings.TrimSuffix(tok, styleClose), styleClose+">")
  			start = i + 1 // the current byte is consumed by the closing tag
  			continue
  		}
  		if inStyle {
  			continue
  		}
  		if strings.HasSuffix(tok, scriptClose) {
  			emit(strings.TrimSuffix(tok, scriptClose), scriptClose+">")
  			start = i + 1
  			continue
  		}
  		if inScript {
  			continue
  		}

  		switch htmlStr[i] {
  		case '<':
  			if i+1 < len(htmlStr) && htmlStr[i+1] == '?' {
  				inPHP = true
  				continue
  			}
  			if inPHP {
  				continue
  			}
  			emit(tok)
  			start = i
  		case '>':
  			if i > 0 && htmlStr[i-1] == '?' {
  				inPHP = false
  				continue
  			}
  			if inPHP {
  				continue
  			}
  			emit(htmlStr[start : i+1])
  			start = i + 1
  		}
  	}
  	return append(tokList, htmlStr[start:])
  }
  80. func buildTree(tokList []string) *Node {
  81. root := &Node{}
  82. node := &Node{}
  83. node.Parent = root
  84. for _, tok := range tokList {
  85. prev := node
  86. prev.token = tok
  87. node = &Node{}
  88. if getTokType(tok) == "open" {
  89. node.Parent = prev
  90. }
  91. if getTokType(tok) == "text" {
  92. node.Parent = prev.Parent
  93. }
  94. if getTokType(tok) == "selfclosing" {
  95. node.Parent = prev.Parent
  96. }
  97. if getTokType(tok) == "comment" {
  98. node.Parent = prev.Parent
  99. }
  100. if getTokType(tok) == "close" {
  101. prev.Parent = prev.Parent.Parent
  102. node.Parent = prev.Parent
  103. }
  104. prev.Parent.Children = append(prev.Parent.Children, prev)
  105. }
  106. return root
  107. }
  108. func getTokType(tok string) string {
  109. if len(tok) < 2 {
  110. return "text"
  111. }
  112. fc := string(tok[0])
  113. sc := string(tok[1])
  114. if sc == "/" {
  115. return "close"
  116. }
  117. if fc == "<" && sc == "!" {
  118. return "comment"
  119. }
  120. if fc == "<" && sc == "?" {
  121. return "text"
  122. }
  123. if fc == "<" {
  124. if isSelfClosing(tok) {
  125. return "selfclosing"
  126. }
  127. return "open"
  128. }
  129. return "text"
  130. }
  131. func isSelfClosing(tok string) bool {
  132. tags := map[string]bool{
  133. "area": true,
  134. "base": true,
  135. "br": true,
  136. "col": true,
  137. "embed": true,
  138. "hr": true,
  139. "img": true,
  140. "input": true,
  141. "link": true,
  142. "meta": true,
  143. "param": true,
  144. "source": true,
  145. "track": true,
  146. "wbr": true,
  147. }
  148. return tags[getTagName(tok)]
  149. }
  // getTagName returns the name part of a tag token: the bytes after the first
  // byte of tok (the '<' for a tag; the first byte is skipped whatever it is)
  // up to the first whitespace character or '>'.
  //
  // A '/' ends the name unless it is the very first byte of the name, so
  // "<br/>" yields "br" while a closing tag such as "</p>" still yields "/p".
  // Tokens shorter than two bytes yield "". The name is returned as a
  // substring of tok, so its case and any non-ASCII bytes are left untouched.
  func getTagName(tok string) string {
  	if len(tok) < 2 {
  		return ""
  	}
  	name := tok[1:]
  	if end := strings.IndexAny(name, " \t\n\r\f>"); end >= 0 {
  		name = name[:end]
  	}
  	// Look for '/' only after the first byte so the leading slash of a
  	// closing tag is kept.
  	if len(name) > 1 {
  		if slash := strings.IndexByte(name[1:], '/'); slash >= 0 {
  			name = name[:slash+1]
  		}
  	}
  	return name
  }
  161. func getCloseNode(node *Node) *Node {
  162. if getTokType(node.token) != "open" {
  163. return &Node{}
  164. }
  165. idx := 0
  166. for i, child := range node.Parent.Children {
  167. if child == node {
  168. idx = i
  169. break
  170. }
  171. }
  172. idx = idx + 1
  173. if idx > len(node.Parent.Children)-1 {
  174. fmt.Println("Parse Error: Unclosed tag in " + node.token)
  175. idx--
  176. }
  177. return node.Parent.Children[idx]
  178. }
  179. func matchSelector(node *Node, sel string) bool {
  180. if getTokType(node.token) == "close" {
  181. return false
  182. }
  183. if getTagName(node.token) == sel {
  184. return true
  185. }
  186. if "#"+node.GetAttribute("id") == sel {
  187. return true
  188. }
  189. for _, class := range node.ClassList() {
  190. if "."+class == sel {
  191. return true
  192. }
  193. }
  194. return false
  195. }