using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Mvc;

namespace TestInsect.Controllers
{
    /// <summary>
    /// Downloads a complete novel from a fixed table-of-contents page and appends
    /// every chapter to a single text file under the application's "Txt/" folder.
    /// </summary>
    public class CrawlerController : Controller
    {
        // Table-of-contents page of the novel to crawl.
        private const string IndexUrl = "http://www.23us.so/files/article/html/13/13655/index.html";

        // NOTE(review): the markup inside the string literals below was stripped from the
        // copy of this file that was reviewed (HTML-looking fragments were removed by the
        // extraction). Only the tail of ChapterLinkRegex survived verbatim; the remaining
        // patterns are reconstructed from the surrounding code and comments and MUST be
        // confirmed against the original source / the live page before relying on them.

        // Group 1 is expected to hold the comma-separated keyword list of the index page;
        // its first entry is used as the novel name. NOTE(review): reconstructed pattern.
        private static readonly Regex NovelNameRegex =
            new Regex(@"<meta name=""keywords"".+content=""(.+)""/>");

        // Matches the block that contains the chapter list. NOTE(review): reconstructed pattern.
        private static readonly Regex ChapterListRegex =
            new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");

        // Matches one chapter anchor: group 1 = chapter URL, group 2 = chapter title.
        // NOTE(review): the leading "<a[^>" and the trailing "</a>" were stripped and restored here.
        private static readonly Regex ChapterLinkRegex =
            new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);

        // Matches the element holding the chapter text. NOTE(review): reconstructed pattern.
        private static readonly Regex ChapterBodyRegex =
            new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");

        // GET: Crawler
        /// <summary>
        /// Runs the crawl synchronously and then renders the default view.
        /// </summary>
        public ActionResult Index()
        {
            Index1();
            return View();
        }

        // GET: Crawler
        /// <summary>
        /// Fetches the table of contents, then downloads every linked chapter in order
        /// and appends "title + text" to the novel's text file.
        /// </summary>
        public void Index1()
        {
            string html = HttpGet(IndexUrl, "");

            // Novel name: first entry of the comma-separated capture.
            string name = NovelNameRegex.Match(html).Groups[1].Value.Split(',')[0];

            // Restrict link extraction to the chapter list so that navigation links are ignored.
            string chapterList = ChapterListRegex.Match(html).Groups[0].Value;

            // Output folder; computed once because it does not change per chapter.
            string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";

            foreach (Match link in ChapterLinkRegex.Matches(chapterList))
            {
                // Group 0 is the whole anchor, group 1 the chapter URL
                // (e.g. http://www.23us.so/files/article/html/13/13655/5638725.html),
                // group 2 the chapter title.
                string title = link.Groups[2].Value;
                string chapterHtml = HttpGet(link.Groups[1].Value, "");

                // The novel name and the chapter title could alternatively be read from the
                // chapter page itself (e.g. from its heading) instead of the index page.

                // Strip the wrapper element and convert the remaining markup to plain text.
                // NOTE(review): the four search strings are reconstructed (see note above).
                string content = ChapterBodyRegex.Match(chapterHtml).Groups[0].Value
                    .Replace("<dd id=\"contents\">", "")
                    .Replace("</dd>", "")
                    .Replace("&nbsp;", "")
                    .Replace("<br />", "\r\n");

                Novel(title + "\r\n" + content, name, path);
            }
        }

        /// <summary>
        /// Appends text to the file "path + name + .txt", creating the directory and the
        /// file when they do not exist yet.
        /// </summary>
        /// <param name="content">Text to append.</param>
        /// <param name="name">File name without extension.</param>
        /// <param name="path">Target directory, including the trailing separator.</param>
        [NonAction] // helper, not an endpoint: as a routable action it would allow arbitrary file writes
        public void Novel(string content, string name, string path)
        {
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(path);

            // Appending creates the file when it is missing, so no separate "create" branch is
            // needed. The trailing "\r\n" + NewLine reproduces the previous
            // StreamWriter.WriteLine(content + "\r\n") output byte for byte.
            System.IO.File.AppendAllText(path + name + ".txt", content + "\r\n" + Environment.NewLine);
        }

        /// <summary>
        /// Sends a form-encoded POST request and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="Url">Absolute request URL.</param>
        /// <param name="postDataStr">Form body (already URL-encoded); null is treated as empty.</param>
        /// <returns>The response body.</returns>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        [NonAction] // helper, not an endpoint: as a routable action it would allow server-side request forgery
        public string HttpPost(string Url, string postDataStr)
        {
            // Encode once and derive Content-Length from the bytes that are actually sent.
            // Previously the length was counted in UTF-8 while the body was written as gb2312,
            // so any non-ASCII character produced a mismatching Content-Length.
            byte[] body = Encoding.GetEncoding("gb2312").GetBytes(postDataStr ?? "");

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = body.Length;
            request.CookieContainer = new CookieContainer();

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Sends a GET request and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="Url">Absolute request URL without query string.</param>
        /// <param name="postDataStr">Query string without the leading '?'; null or empty for none.</param>
        /// <returns>The response body.</returns>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        [NonAction] // helper, not an endpoint: as a routable action it would allow server-side request forgery
        public string HttpGet(string Url, string postDataStr)
        {
            string requestUrl = string.IsNullOrEmpty(postDataStr) ? Url : Url + "?" + postDataStr;

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUrl);
            request.Method = "GET";
            request.ContentType = "text/html;charset=UTF-8";

            // The former catch block merely called GetResponse() again on the same, already
            // failed request, which cannot succeed; the WebException is now simply propagated.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }
}