为了账号安全,请及时绑定邮箱和手机。立即绑定

正则表达式获取 <script> 标签

正则表达式获取 <script> 标签

慕雪6442864 2023-07-14 16:34:30
我想定位内容中包含 "@type": "NewsArticle" 的整个 script 标签。我目前使用的正则表达式类似这样:<script type="application\/ld\+json">[^\{]*?{(.*?)\}[^\}]*?<\/script> 用上面的正则表达式可以定位到最上面的那个 script 标签,但我要找的是 NewsArticle 的 JSON 信息。在本例中它位于第二个标签,而有些页面中有 4 个以上的 application/ld+json 标签;不过无论如何,每个页面中始终都存在 "@type": "NewsArticle"。所以我想找一个只匹配这个特定 script 标签的正则表达式。感谢帮助。<script type="application/ld+json">{    "@context": "http://schema.org",    "@type": "Organization",    "@id": "https://www.givemesport.com/#gms",    "name": "GiveMeSport",    "url": "https://www.givemesport.com",    "logo": {        "@type": "ImageObject",        "url": "https://gmsrp.cachefly.net/v4/images/logo-gms-black.png"    },    "sameAs":[        "https://www.facebook.com/GiveMeSport",        "https://www.instagram.com/givemesport",        "https://twitter.com/GiveMeSport",        "https://www.youtube.com/user/GiveMeSport"    ]}</script>    <script type="application/ld+json">    {    "@context": "http://schema.org",    "@type": "NewsArticle",    "mainEntityOfPage": "https://www.givemesport.com/1612447-man-uniteds-scott-mctominay-delighted-fans-with-reaction-after-third-goal-vs-rb-leipzig",    "url": "https://www.givemesport.com/1612447-man-uniteds-scott-mctominay-delighted-fans-with-reaction-after-third-goal-vs-rb-leipzig",    "headline": "Man United's Scott McTominay delighted fans with reaction after third goal vs RB Leipzig",    "datePublished": "2020-10-30T21:52:48.3510000Z",    "dateModified": "2020-10-30T21:52:48.3510000Z",    "description": "Man United's Scott McTominay delighted fans with reaction after third goal vs RB Leipzig",    "articleSection": "Football",    "keywords": ["Football","Manchester United","Marcus Rashford","RB Leipzig","Scott McTominay","UEFA Champions"],    "creator": ["Scott Wilson"],    "thumbnailUrl": "https://gmsrp.cachefly.net/images/20/10/30/03a426c8204af5c8d02282afaeed6189/144.jpg",    "author": {    "@type": "Person",    "name": "Scott Wilson",    "sameAs": "https://www.givemesport.com/scott-wilson-1"    },
查看完整描述

1 回答

?
森栏

TA贡献1810条经验 获得超5个赞

很遗憾您不打算遵循最佳实践:用正则表达式解析 HTML 会带来很多问题。不过,如果您只想要一个快速但不严谨的权宜之计,可以使用

<script type="application\/ld\+json">((?:(?!<\/?script)[\w\W])*?"@type":\s*"NewsArticle"[\w\W]*?)<\/script>

解释

--------------------------------------------------------------------------------

  <script type="application  '<script type="application'

--------------------------------------------------------------------------------

  \/                       '/'

--------------------------------------------------------------------------------

  ld                       'ld'

--------------------------------------------------------------------------------

  \+                       '+'

--------------------------------------------------------------------------------

  json">                   'json">'

--------------------------------------------------------------------------------

  (                        group and capture to \1:

--------------------------------------------------------------------------------

    (?:                      group, but do not capture (0 or more

                             times (matching the least amount

                             possible)):

--------------------------------------------------------------------------------

      (?!                      look ahead to see if there is not:

--------------------------------------------------------------------------------

        <                        '<'

--------------------------------------------------------------------------------

        \/?                      '/' (optional (matching the most

                                 amount possible))

--------------------------------------------------------------------------------

        script                   'script'

--------------------------------------------------------------------------------

      )                        end of look-ahead

--------------------------------------------------------------------------------

      [\w\W]                   any character of: word characters (a-

                               z, A-Z, 0-9, _), non-word characters

                               (all but a-z, A-Z, 0-9, _)

--------------------------------------------------------------------------------

    )*?                      end of grouping

--------------------------------------------------------------------------------

    "@type":                 '"@type":'

--------------------------------------------------------------------------------

    \s*                      whitespace (\n, \r, \t, \f, and " ") (0

                             or more times (matching the most amount

                             possible))

--------------------------------------------------------------------------------

    "NewsArticle"            '"NewsArticle"'

--------------------------------------------------------------------------------

    [\w\W]*?                 any character of: word characters (a-z,

                             A-Z, 0-9, _), non-word characters (all

                             but a-z, A-Z, 0-9, _) (0 or more times

                             (matching the least amount possible))

--------------------------------------------------------------------------------

  )                        end of \1

--------------------------------------------------------------------------------

  <                        '<'

--------------------------------------------------------------------------------

  \/                       '/'

--------------------------------------------------------------------------------

  script>                  'script>'


查看完整回答
反对 回复 2023-07-14
  • 1 回答
  • 0 关注
  • 164 浏览
慕课专栏
更多

添加回答

举报

0/150
提交
取消
意见反馈 帮助中心 APP下载
官方微信