zoukankan      html  css  js  c++  java
  • topic 知乎

    using HtmlAgilityPack;
    using Newtonsoft.Json;
    using Newtonsoft.Json.Linq;
    using System.Collections;
    using System.IO;
    
    namespace EasySpider
    {
        /// <summary>
        /// Walks the nested topic list found inside the
        /// <c>div#zh-topic-organize-page-children</c> element of an HTML document and
        /// appends, for every fourth-level topic link, the chain of topics leading to it
        /// as one JSON array per line.
        /// </summary>
        public class ReadZhihu
        {
            // NOTE(review): "D:" without a following backslash is a drive-relative path on
            // Windows; a backslash was probably lost somewhere. The literal is kept unchanged
            // because the FormatDocument.Program reader in this file opens the very same
            // literal - confirm the intended location and fix both together.
            private const string StructureFilePath = @"D:学科Struct.json";

            /// <summary>
            /// Parses <paramref name="document"/> as HTML, finds every topic anchor nested
            /// four list levels deep under the topic container, and appends the topic chain
            /// of each anchor (outermost topic first) to the structure file.
            /// </summary>
            /// <param name="document">Raw HTML text. Null or empty input is ignored.</param>
            public static void FormatDocument(string document)
            {
                if (string.IsNullOrEmpty(document))
                {
                    return;
                }

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(document);

                // Single quotes inside the XPath avoid having to escape double quotes in the C# literal.
                HtmlNode container = htmlDocument.DocumentNode.SelectSingleNode(".//div[@id='zh-topic-organize-page-children']");
                if (container == null)
                {
                    // The page does not contain the topic container; nothing to extract.
                    return;
                }

                // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection level4 = container.SelectNodes(".//ul/li/ul/li/ul/li/ul/li/a[@name='topic']");
                if (level4 == null)
                {
                    return;
                }

                foreach (HtmlNode leaf in level4)
                {
                    // GetParentNode pushes leaf first and ancestors afterwards, so popping
                    // yields the chain from the outermost topic down to the leaf.
                    Stack chain = new Stack();
                    GetParentNode(leaf, ref chain);

                    JArray structure = new JArray();
                    while (chain.Count > 0)
                    {
                        structure.Add(chain.Pop());
                    }

                    WriteData(structure, StructureFilePath);
                }
            }

            /// <summary>
            /// Pushes a topic object for <paramref name="node"/> onto <paramref name="s"/>,
            /// then repeats for the first anchor found under the node's great-grandparent,
            /// climbing until an anchor has no href or no inner HTML, there is no
            /// great-grandparent or no anchor under it, or the anchor's topic equals the
            /// topic currently on top of the stack.
            /// </summary>
            /// <param name="node">Anchor element to start from. Null is ignored.</param>
            /// <param name="s">Stack receiving <see cref="JObject"/> entries; created when null.</param>
            public static void GetParentNode(HtmlNode node, ref Stack s)
            {
                if (s == null)
                {
                    s = new Stack();
                }

                HtmlNode current = node;
                while (current != null)
                {
                    string url = current.GetAttributeValue("href", string.Empty);
                    string topic = current.InnerHtml;

                    if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(topic))
                    {
                        return;
                    }

                    // Stop once the lookup starts returning the anchor that was just pushed,
                    // otherwise the same topic would be recorded repeatedly.
                    if (s.Count > 0 && ((JObject)s.Peek())["topic"].ToString() == topic)
                    {
                        return;
                    }

                    s.Push(GenereateObject(url, topic));

                    // Climb three levels (presumably a -> li -> ul -> parent li; verify
                    // against the page markup) and take the first anchor found there.
                    HtmlNode parent = current.ParentNode;
                    HtmlNode grandParent = parent != null ? parent.ParentNode : null;
                    HtmlNode ancestor = grandParent != null ? grandParent.ParentNode : null;
                    current = ancestor != null ? ancestor.SelectSingleNode(".//a[1]") : null;
                }
            }

            /// <summary>
            /// Builds a JSON object with the properties <c>topic</c> and <c>url</c>.
            /// </summary>
            /// <param name="url">Value stored under <c>url</c>.</param>
            /// <param name="topic">Value stored under <c>topic</c>.</param>
            /// <returns>A new <see cref="JObject"/> holding both values.</returns>
            public static JObject GenereateObject(string url, string topic)
            {
                JObject obj = new JObject();
                obj.Add("topic", topic);
                obj.Add("url", url);
                return obj;
            }

            /// <summary>
            /// Serializes <paramref name="obj"/> to JSON and appends it as a single line
            /// to <paramref name="fileName"/>.
            /// </summary>
            /// <param name="obj">Object to serialize with <see cref="JsonConvert"/>.</param>
            /// <param name="fileName">Path of the file to append to.</param>
            public static void WriteData(object obj, string fileName)
            {
                string json = JsonConvert.SerializeObject(obj);

                // Opening the writer in append mode creates the file when it is missing.
                // A separate File.Create call must not be used here: it returns an open
                // FileStream, and leaving that undisposed keeps the file locked so the
                // writer below would fail on a freshly created file.
                using (StreamWriter writer = new StreamWriter(fileName, true))
                {
                    writer.WriteLine(json);
                }
            }
        }
    }
    using Newtonsoft.Json;
    using Newtonsoft.Json.Linq;
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    
    namespace FormatDocument
    {
        /// <summary>
        /// Reads the JSON-lines structure file and writes the first four entries of every
        /// sufficiently long row to the topic file, skipping rows already present there.
        /// </summary>
        class Program
        {
            /// <summary>
            /// Processes the structure file line by line and appends new rows to the topic file.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                // NOTE(review): "D:" without a following backslash is a drive-relative path on
                // Windows; a backslash was probably lost somewhere. Kept unchanged because the
                // EasySpider.ReadZhihu writer in this file uses the very same literal -
                // confirm the intended location and fix both together.
                string path = @"D:学科Struct.json";

                // The original literal contained a TAB character (an expanded "\t"), which is
                // not a valid file-name character on Windows; the intended name is topic.json.
                string outputPath = @"D:\topic.json";

                int i = 0;
                int j = 0;

                using (StreamReader sr = new StreamReader(path, Encoding.UTF8))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        Console.WriteLine("------------------readline: {0}------------------", ++i);

                        // Track the number of inserted rows here: an int passed by value
                        // cannot be incremented by the callee on the caller's behalf.
                        if (AppendIfNew(line, outputPath, j))
                        {
                            j++;
                        }
                    }
                }
            }

            /// <summary>
            /// Appends the first four elements of the JSON array in <paramref name="row"/>
            /// to <paramref name="fileName"/> as one line, unless the array has fewer than
            /// four elements or the serialized text already occurs in the file.
            /// </summary>
            /// <param name="row">One line of JSON array text.</param>
            /// <param name="fileName">Path of the file to append to; created when missing.</param>
            /// <param name="j">Number of rows inserted so far; only used for the log message.</param>
            public static void WriteData(string row, string fileName, int j)
            {
                AppendIfNew(row, fileName, j);
            }

            /// <summary>
            /// Performs the work of <see cref="WriteData"/> and reports whether a row was appended.
            /// </summary>
            /// <param name="row">One line of JSON array text; blank lines are skipped.</param>
            /// <param name="fileName">Path of the file to append to.</param>
            /// <param name="insertedSoFar">Rows inserted before this call, for the log message.</param>
            /// <returns><c>true</c> when a new line was written; otherwise <c>false</c>.</returns>
            private static bool AppendIfNew(string row, string fileName, int insertedSoFar)
            {
                Console.WriteLine("-----------write data begin -----------");

                if (string.IsNullOrWhiteSpace(row))
                {
                    // JArray.Parse would throw on a blank line.
                    return false;
                }

                JArray item = JArray.Parse(row);
                if (item.Count < 4)
                {
                    return false;
                }

                JArray outArray = new JArray();
                for (int i = 0; i < 4; i++)
                {
                    outArray.Add(item[i]);
                }

                string json = JsonConvert.SerializeObject(outArray);

                // A missing output file simply means nothing has been written yet.
                // The whole file is re-read for every row; acceptable for small inputs.
                string existing = File.Exists(fileName) ? File.ReadAllText(fileName) : string.Empty;
                if (existing.Contains(json))
                {
                    return false;
                }

                using (StreamWriter sw = File.AppendText(fileName))
                {
                    Console.WriteLine("-----------insert {0} row -----------", insertedSoFar + 1);
                    sw.WriteLine(json);
                }

                return true;
            }
        }
    }
    I'm fine, it's ok
  • 相关阅读:
    Element filtername is not allowed here-web.xml version="3.0"-intellij idea
    探究JavaScript闭包
    telnet的安装和使用
    Oracle数据库常用的sql语句
    centos6上安装jenkins
    idea的maven项目不知道为啥下载不下来jar包,看本地仓库只是下载了一下xml文件,没有jar包问题
    Oracle数据库使用mybatis的时候,实体类日期为Date类型,mybatis里面定义的是Date类型,插入的时候,时分秒全部是12:00:00问题
    maven打包某个分支的包
    maven打包到私服,打的是war包,好郁闷
    多线程初学习
  • 原文地址:https://www.cnblogs.com/skywss27/p/9991048.html
Copyright © 2011-2022 走看看